From 1c13975ec5dc7f6641511f7d26c9b802cd8ed628 Mon Sep 17 00:00:00 2001
From: Wenxi Onyx
Date: Thu, 17 Jul 2025 14:53:31 -0700
Subject: [PATCH 01/78] minor internet search env vars

---
 deployment/docker_compose/docker-compose.gpu-dev.yml | 1 +
 deployment/docker_compose/env.prod.template          | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/deployment/docker_compose/docker-compose.gpu-dev.yml b/deployment/docker_compose/docker-compose.gpu-dev.yml
index 554ad83e3f8..1b0c1078e5d 100644
--- a/deployment/docker_compose/docker-compose.gpu-dev.yml
+++ b/deployment/docker_compose/docker-compose.gpu-dev.yml
@@ -145,6 +145,7 @@ services:
       - GENERATIVE_MODEL_ACCESS_CHECK_FREQ=${GENERATIVE_MODEL_ACCESS_CHECK_FREQ:-}
       - DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-}
       - LITELLM_EXTRA_HEADERS=${LITELLM_EXTRA_HEADERS:-}
+      - EXA_API_KEY=${EXA_API_KEY:-}
       # Query Options
       - DOC_TIME_DECAY=${DOC_TIME_DECAY:-} # Recency Bias for search results, decay at 1 / (1 + DOC_TIME_DECAY * x years)
       - HYBRID_ALPHA=${HYBRID_ALPHA:-} # Hybrid Search Alpha (0 for entirely keyword, 1 for entirely vector)
diff --git a/deployment/docker_compose/env.prod.template b/deployment/docker_compose/env.prod.template
index cf36379468d..a4e8856dfe1 100644
--- a/deployment/docker_compose/env.prod.template
+++ b/deployment/docker_compose/env.prod.template
@@ -8,7 +8,7 @@ WEB_DOMAIN=http://localhost:3000
 
 
 # NOTE: Generative AI configurations are done via the UI now
-
+EXA_API_KEY=
 
 # The following are for configuring User Authentication, supported flows are:
 # disabled

From 73258f26ea769c18f7993618ec84c0a863dc29bb Mon Sep 17 00:00:00 2001
From: Wenxi Onyx
Date: Thu, 17 Jul 2025 16:20:50 -0700
Subject: [PATCH 02/78] clean up connector page and add new option for uncommon connectors

---
 backend/onyx/configs/app_configs.py           |  3 +
 backend/onyx/configs/constants.py             |  3 -
 backend/onyx/server/settings/models.py        |  3 +
 backend/onyx/server/settings/store.py         |  4 ++
 .../docker_compose/docker-compose.dev.yml     |  1 +
 .../docker_compose/docker-compose.gpu-dev.yml |  1 +
 .../docker-compose.multitenant-dev.yml        |  1 +
 deployment/docker_compose/env.prod.template   |  4 ++
 web/src/app/admin/add-connector/page.tsx      | 57 ++++++++++---------
 web/src/app/admin/settings/interfaces.ts      |  3 +
 web/src/lib/search/interfaces.ts              |  6 +-
 web/src/lib/sources.ts                        | 17 +++---
 12 files changed, 61 insertions(+), 42 deletions(-)

diff --git a/backend/onyx/configs/app_configs.py b/backend/onyx/configs/app_configs.py
index 2c538950186..047e49d7d8b 100644
--- a/backend/onyx/configs/app_configs.py
+++ b/backend/onyx/configs/app_configs.py
@@ -38,6 +38,9 @@
 # Controls whether users can use User Knowledge (personal documents) in assistants
 DISABLE_USER_KNOWLEDGE = os.environ.get("DISABLE_USER_KNOWLEDGE", "").lower() == "true"
 
+# If set to true, will show extra/uncommon connectors in the "Other" category
+SHOW_EXTRA_CONNECTORS = os.environ.get("SHOW_EXTRA_CONNECTORS", "").lower() == "true"
+
 # Controls whether to allow admin query history reports with:
 # 1. associated user emails
 # 2. anonymized user emails
diff --git a/backend/onyx/configs/constants.py b/backend/onyx/configs/constants.py
index 18bfb61496e..6f1a8f8157c 100644
--- a/backend/onyx/configs/constants.py
+++ b/backend/onyx/configs/constants.py
@@ -216,9 +216,6 @@ class BlobType(str, Enum):
     GOOGLE_CLOUD_STORAGE = "google_cloud_storage"
     OCI_STORAGE = "oci_storage"
 
-    # Special case, for internet search
-    NOT_APPLICABLE = "not_applicable"
-
 
 class DocumentIndexType(str, Enum):
     COMBINED = "combined"  # Vespa
diff --git a/backend/onyx/server/settings/models.py b/backend/onyx/server/settings/models.py
index 9368ed91e50..90a1f7a7143 100644
--- a/backend/onyx/server/settings/models.py
+++ b/backend/onyx/server/settings/models.py
@@ -62,6 +62,9 @@ class Settings(BaseModel):
     # User Knowledge settings
     user_knowledge_enabled: bool | None = True
 
+    # Connector settings
+    show_extra_connectors: bool | None = True
+
 
 class UserSettings(Settings):
     notifications: list[Notification]
diff --git a/backend/onyx/server/settings/store.py b/backend/onyx/server/settings/store.py
index a1dc319ed35..6e32e22c16c 100644
--- a/backend/onyx/server/settings/store.py
+++ b/backend/onyx/server/settings/store.py
@@ -1,5 +1,6 @@
 from onyx.configs.app_configs import DISABLE_USER_KNOWLEDGE
 from onyx.configs.app_configs import ONYX_QUERY_HISTORY_TYPE
+from onyx.configs.app_configs import SHOW_EXTRA_CONNECTORS
 from onyx.configs.constants import KV_SETTINGS_KEY
 from onyx.configs.constants import OnyxRedisLocks
 from onyx.key_value_store.factory import get_kv_store
@@ -53,6 +54,9 @@ def load_settings() -> Settings:
     if DISABLE_USER_KNOWLEDGE:
         settings.user_knowledge_enabled = False
 
+    # Override show extra connectors setting based on environment variable
+    settings.show_extra_connectors = SHOW_EXTRA_CONNECTORS
+
     return settings
 
 
diff --git a/deployment/docker_compose/docker-compose.dev.yml b/deployment/docker_compose/docker-compose.dev.yml
index 643b0535a20..908479af8da 100644
--- a/deployment/docker_compose/docker-compose.dev.yml
+++ b/deployment/docker_compose/docker-compose.dev.yml
@@ -233,6 +233,7 @@ services:
       - GITHUB_CONNECTOR_BASE_URL=${GITHUB_CONNECTOR_BASE_URL:-}
       - MAX_DOCUMENT_CHARS=${MAX_DOCUMENT_CHARS:-}
       - MAX_FILE_SIZE_BYTES=${MAX_FILE_SIZE_BYTES:-}
+      - SHOW_EXTRA_CONNECTORS=${SHOW_EXTRA_CONNECTORS:-}
       # Egnyte OAuth Configs
       - EGNYTE_CLIENT_ID=${EGNYTE_CLIENT_ID:-}
       - EGNYTE_CLIENT_SECRET=${EGNYTE_CLIENT_SECRET:-}
diff --git a/deployment/docker_compose/docker-compose.gpu-dev.yml b/deployment/docker_compose/docker-compose.gpu-dev.yml
index 1b0c1078e5d..46f6b8de001 100644
--- a/deployment/docker_compose/docker-compose.gpu-dev.yml
+++ b/deployment/docker_compose/docker-compose.gpu-dev.yml
@@ -192,6 +192,7 @@ services:
       - GONG_CONNECTOR_START_TIME=${GONG_CONNECTOR_START_TIME:-}
       - NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP=${NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP:-}
       - GITHUB_CONNECTOR_BASE_URL=${GITHUB_CONNECTOR_BASE_URL:-}
+      - SHOW_EXTRA_CONNECTORS=${SHOW_EXTRA_CONNECTORS:-}
       # Onyx SlackBot Configs
      - DANSWER_BOT_DISABLE_DOCS_ONLY_ANSWER=${DANSWER_BOT_DISABLE_DOCS_ONLY_ANSWER:-}
       - DANSWER_BOT_FEEDBACK_VISIBILITY=${DANSWER_BOT_FEEDBACK_VISIBILITY:-}
diff --git a/deployment/docker_compose/docker-compose.multitenant-dev.yml b/deployment/docker_compose/docker-compose.multitenant-dev.yml
index 98db1c5da6f..6dd60186e59 100644
--- a/deployment/docker_compose/docker-compose.multitenant-dev.yml
+++ b/deployment/docker_compose/docker-compose.multitenant-dev.yml
@@ -214,6 +214,7 @@ services:
       - GITHUB_CONNECTOR_BASE_URL=${GITHUB_CONNECTOR_BASE_URL:-}
       -
MAX_DOCUMENT_CHARS=${MAX_DOCUMENT_CHARS:-} - MAX_FILE_SIZE_BYTES=${MAX_FILE_SIZE_BYTES:-} + - SHOW_EXTRA_CONNECTORS=${SHOW_EXTRA_CONNECTORS:-} # Egnyte OAuth Configs - EGNYTE_CLIENT_ID=${EGNYTE_CLIENT_ID:-} - EGNYTE_CLIENT_SECRET=${EGNYTE_CLIENT_SECRET:-} diff --git a/deployment/docker_compose/env.prod.template b/deployment/docker_compose/env.prod.template index a4e8856dfe1..f8800173aec 100644 --- a/deployment/docker_compose/env.prod.template +++ b/deployment/docker_compose/env.prod.template @@ -65,3 +65,7 @@ DB_READONLY_PASSWORD=password # If setting the vespa language is required, set this ('en', 'de', etc.). # See: https://docs.vespa.ai/en/linguistics.html #VESPA_LANGUAGE_OVERRIDE= + +# Uncommon connectors supported by the community +# See https://docs.onyx.app for list of these connectors +SHOW_EXTRA_CONNECTORS=False \ No newline at end of file diff --git a/web/src/app/admin/add-connector/page.tsx b/web/src/app/admin/add-connector/page.tsx index 91dccea0bab..1c75372b273 100644 --- a/web/src/app/admin/add-connector/page.tsx +++ b/web/src/app/admin/add-connector/page.tsx @@ -7,7 +7,14 @@ import { listSourceMetadata } from "@/lib/sources"; import Title from "@/components/ui/title"; import { Button } from "@/components/ui/button"; import Link from "next/link"; -import { useCallback, useEffect, useMemo, useRef, useState } from "react"; +import { + useCallback, + useContext, + useEffect, + useMemo, + useRef, + useState, +} from "react"; import { Tooltip, TooltipContent, @@ -24,6 +31,7 @@ import useSWR from "swr"; import { errorHandlingFetcher } from "@/lib/fetcher"; import { buildSimilarCredentialInfoURL } from "@/app/admin/connector/[ccPairId]/lib"; import { Credential } from "@/lib/connectors/credentials"; +import { SettingsContext } from "@/components/settings/SettingsProvider"; function SourceTile({ sourceMetadata, @@ -148,6 +156,7 @@ export default function Page() { const sources = useMemo(() => listSourceMetadata(), []); const [searchTerm, setSearchTerm] = useState(""); const { data: federatedConnectors } = useFederatedConnectors(); + const settings = useContext(SettingsContext); // Fetch Slack credentials to determine navigation behavior const { data: slackCredentials } = useSWR[]>( @@ -177,7 +186,7 @@ export default function Page() { const categorizedSources = useMemo(() => { const filtered = filterSources(sources); - return Object.values(SourceCategory).reduce( + const categories = Object.values(SourceCategory).reduce( (acc, category) => { acc[category] = sources.filter( (source) => @@ -189,7 +198,25 @@ export default function Page() { }, {} as Record ); - }, [sources, filterSources, searchTerm]); + + // Filter out the "Other" category if show_extra_connectors is false + if (settings?.settings?.show_extra_connectors === false) { + const filteredCategories = Object.entries(categories).filter( + ([category]) => category !== SourceCategory.Other + ); + return Object.fromEntries(filteredCategories) as Record< + SourceCategory, + SourceMetadata[] + >; + } + + return categories; + }, [ + sources, + filterSources, + searchTerm, + settings?.settings?.show_extra_connectors, + ]); const handleKeyPress = (e: React.KeyboardEvent) => { if (e.key === "Enter") { @@ -251,7 +278,6 @@ export default function Page() {
{category}
-

{getCategoryDescription(category as SourceCategory)}

{sources.map((source, sourceInd) => ( ); } - -function getCategoryDescription(category: SourceCategory): string { - switch (category) { - case SourceCategory.Messaging: - return "Integrate with messaging and communication platforms."; - case SourceCategory.ProjectManagement: - return "Link to project management and task tracking tools."; - case SourceCategory.CustomerSupport: - return "Connect to customer support and helpdesk systems."; - case SourceCategory.CustomerRelationshipManagement: - return "Integrate with customer relationship management platforms."; - case SourceCategory.CodeRepository: - return "Integrate with code repositories and version control systems."; - case SourceCategory.Storage: - return "Connect to cloud storage and file hosting services."; - case SourceCategory.Wiki: - return "Link to wiki and knowledge base platforms."; - case SourceCategory.Other: - return "Connect to other miscellaneous knowledge sources."; - default: - return "Connect to various knowledge sources."; - } -} diff --git a/web/src/app/admin/settings/interfaces.ts b/web/src/app/admin/settings/interfaces.ts index 40dc588340c..6adfd2cdc79 100644 --- a/web/src/app/admin/settings/interfaces.ts +++ b/web/src/app/admin/settings/interfaces.ts @@ -30,6 +30,9 @@ export interface Settings { // User Knowledge settings user_knowledge_enabled?: boolean; + + // Connector settings + show_extra_connectors?: boolean; } export enum NotificationType { diff --git a/web/src/lib/search/interfaces.ts b/web/src/lib/search/interfaces.ts index 1fde64f8d9e..8f183e0ef4a 100644 --- a/web/src/lib/search/interfaces.ts +++ b/web/src/lib/search/interfaces.ts @@ -162,10 +162,10 @@ export interface SearchResponse { } export enum SourceCategory { - Storage = "Storage", - Wiki = "Wiki", + Storage = "Web Crawl & File Storage", + Wiki = "Knowledge Base & Wiki", CustomerSupport = "Customer Support", - CustomerRelationshipManagement = "Customer Relationship Management", + SalesAndMarketing = "Sales & Marketing", Messaging = "Messaging", ProjectManagement = "Project Management", CodeRepository = "Code Repository", diff --git a/web/src/lib/sources.ts b/web/src/lib/sources.ts index 61c622446bb..59072ec77d2 100644 --- a/web/src/lib/sources.ts +++ b/web/src/lib/sources.ts @@ -84,7 +84,7 @@ export const SOURCE_METADATA_MAP: SourceMap = { web: { icon: GlobeIcon2, displayName: "Web", - category: SourceCategory.Other, + category: SourceCategory.Storage, docs: "https://docs.onyx.app/connectors/web", }, file: { @@ -154,7 +154,7 @@ export const SOURCE_METADATA_MAP: SourceMap = { gong: { icon: GongIcon, displayName: "Gong", - category: SourceCategory.Other, + category: SourceCategory.SalesAndMarketing, docs: "https://docs.onyx.app/connectors/gong", }, linear: { @@ -190,7 +190,7 @@ export const SOURCE_METADATA_MAP: SourceMap = { hubspot: { icon: HubSpotIcon, displayName: "HubSpot", - category: SourceCategory.CustomerRelationshipManagement, + category: SourceCategory.SalesAndMarketing, docs: "https://docs.onyx.app/connectors/hubspot", }, document360: { @@ -214,7 +214,7 @@ export const SOURCE_METADATA_MAP: SourceMap = { loopio: { icon: LoopioIcon, displayName: "Loopio", - category: SourceCategory.Other, + category: SourceCategory.SalesAndMarketing, }, dropbox: { icon: DropboxIcon, @@ -225,7 +225,7 @@ export const SOURCE_METADATA_MAP: SourceMap = { salesforce: { icon: SalesforceIcon, displayName: "Salesforce", - category: SourceCategory.CustomerRelationshipManagement, + category: SourceCategory.SalesAndMarketing, docs: 
"https://docs.onyx.app/connectors/salesforce", }, sharepoint: { @@ -319,7 +319,7 @@ export const SOURCE_METADATA_MAP: SourceMap = { fireflies: { icon: FirefliesIcon, displayName: "Fireflies", - category: SourceCategory.Other, + category: SourceCategory.SalesAndMarketing, docs: "https://docs.onyx.app/connectors/fireflies", }, egnyte: { @@ -331,7 +331,7 @@ export const SOURCE_METADATA_MAP: SourceMap = { airtable: { icon: AirtableIcon, displayName: "Airtable", - category: SourceCategory.Other, + category: SourceCategory.ProjectManagement, docs: "https://docs.onyx.app/connectors/airtable", }, gitbook: { @@ -351,8 +351,7 @@ export const SOURCE_METADATA_MAP: SourceMap = { displayName: "Email", category: SourceCategory.Messaging, }, - // currently used for the Internet Search tool docs, which is why - // a globe is used + // Placeholder used as non-null default not_applicable: { icon: GlobeIcon, displayName: "Not Applicable", From 10900c517b87b283746e3f0e6df36f15aaf95f56 Mon Sep 17 00:00:00 2001 From: Wenxi Onyx Date: Thu, 17 Jul 2025 16:21:17 -0700 Subject: [PATCH 03/78] vscode env template --- .vscode/env_template.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.vscode/env_template.txt b/.vscode/env_template.txt index dd5417bbe4d..8e641f8b5cd 100644 --- a/.vscode/env_template.txt +++ b/.vscode/env_template.txt @@ -65,3 +65,6 @@ S3_ENDPOINT_URL=http://localhost:9004 S3_FILE_STORE_BUCKET_NAME=onyx-file-store-bucket S3_AWS_ACCESS_KEY_ID=minioadmin S3_AWS_SECRET_ACCESS_KEY=minioadmin + +# Show extra/uncommon connectors +SHOW_EXTRA_CONNECTORS=False From 8cde39a01bd0e332208ce516173bc2a8afbc1f19 Mon Sep 17 00:00:00 2001 From: Wenxi Onyx Date: Thu, 17 Jul 2025 16:56:52 -0700 Subject: [PATCH 04/78] deployment fix and change default to false --- backend/onyx/server/settings/models.py | 2 +- deployment/docker_compose/docker-compose.dev.yml | 4 +++- deployment/docker_compose/docker-compose.gpu-dev.yml | 5 ++++- deployment/docker_compose/docker-compose.multitenant-dev.yml | 5 ++++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/backend/onyx/server/settings/models.py b/backend/onyx/server/settings/models.py index 90a1f7a7143..450e321afc5 100644 --- a/backend/onyx/server/settings/models.py +++ b/backend/onyx/server/settings/models.py @@ -63,7 +63,7 @@ class Settings(BaseModel): user_knowledge_enabled: bool | None = True # Connector settings - show_extra_connectors: bool | None = True + show_extra_connectors: bool | None = False class UserSettings(Settings): diff --git a/deployment/docker_compose/docker-compose.dev.yml b/deployment/docker_compose/docker-compose.dev.yml index 908479af8da..88c131984a8 100644 --- a/deployment/docker_compose/docker-compose.dev.yml +++ b/deployment/docker_compose/docker-compose.dev.yml @@ -119,6 +119,9 @@ services: # Chat Configs - HARD_DELETE_CHATS=${HARD_DELETE_CHATS:-} + # Enables extra/community-supported connectors + - SHOW_EXTRA_CONNECTORS=${SHOW_EXTRA_CONNECTORS:-} + # Enables the use of bedrock models or IAM Auth - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-} - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-} @@ -233,7 +236,6 @@ services: - GITHUB_CONNECTOR_BASE_URL=${GITHUB_CONNECTOR_BASE_URL:-} - MAX_DOCUMENT_CHARS=${MAX_DOCUMENT_CHARS:-} - MAX_FILE_SIZE_BYTES=${MAX_FILE_SIZE_BYTES:-} - - SHOW_EXTRA_CONNECTORS=${SHOW_EXTRA_CONNECTORS:-} # Egnyte OAuth Configs - EGNYTE_CLIENT_ID=${EGNYTE_CLIENT_ID:-} - EGNYTE_CLIENT_SECRET=${EGNYTE_CLIENT_SECRET:-} diff --git a/deployment/docker_compose/docker-compose.gpu-dev.yml 
b/deployment/docker_compose/docker-compose.gpu-dev.yml index 46f6b8de001..a2b76517370 100644 --- a/deployment/docker_compose/docker-compose.gpu-dev.yml +++ b/deployment/docker_compose/docker-compose.gpu-dev.yml @@ -42,6 +42,7 @@ services: - DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-} - DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-} - LITELLM_EXTRA_HEADERS=${LITELLM_EXTRA_HEADERS:-} + - EXA_API_KEY=${EXA_API_KEY:-} # if set, allows for the use of the token budget system - TOKEN_BUDGET_GLOBALLY_ENABLED=${TOKEN_BUDGET_GLOBALLY_ENABLED:-} @@ -97,6 +98,9 @@ services: # Chat Configs - HARD_DELETE_CHATS=${HARD_DELETE_CHATS:-} + # Enables extra/community-supported connectors + - SHOW_EXTRA_CONNECTORS=${SHOW_EXTRA_CONNECTORS:-} + # Vespa Language Forcing # See: https://docs.vespa.ai/en/linguistics.html - VESPA_LANGUAGE_OVERRIDE=${VESPA_LANGUAGE_OVERRIDE:-} @@ -192,7 +196,6 @@ services: - GONG_CONNECTOR_START_TIME=${GONG_CONNECTOR_START_TIME:-} - NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP=${NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP:-} - GITHUB_CONNECTOR_BASE_URL=${GITHUB_CONNECTOR_BASE_URL:-} - - SHOW_EXTRA_CONNECTORS=${SHOW_EXTRA_CONNECTORS:-} # Onyx SlackBot Configs - DANSWER_BOT_DISABLE_DOCS_ONLY_ANSWER=${DANSWER_BOT_DISABLE_DOCS_ONLY_ANSWER:-} - DANSWER_BOT_FEEDBACK_VISIBILITY=${DANSWER_BOT_FEEDBACK_VISIBILITY:-} diff --git a/deployment/docker_compose/docker-compose.multitenant-dev.yml b/deployment/docker_compose/docker-compose.multitenant-dev.yml index 6dd60186e59..a3b7f4ee4c0 100644 --- a/deployment/docker_compose/docker-compose.multitenant-dev.yml +++ b/deployment/docker_compose/docker-compose.multitenant-dev.yml @@ -116,6 +116,10 @@ services: # Vespa Language Forcing # See: https://docs.vespa.ai/en/linguistics.html - VESPA_LANGUAGE_OVERRIDE=${VESPA_LANGUAGE_OVERRIDE:-} + + # Enables extra/community-supported connectors + - SHOW_EXTRA_CONNECTORS=${SHOW_EXTRA_CONNECTORS:-} + extra_hosts: - "host.docker.internal:host-gateway" logging: @@ -214,7 +218,6 @@ services: - GITHUB_CONNECTOR_BASE_URL=${GITHUB_CONNECTOR_BASE_URL:-} - MAX_DOCUMENT_CHARS=${MAX_DOCUMENT_CHARS:-} - MAX_FILE_SIZE_BYTES=${MAX_FILE_SIZE_BYTES:-} - - SHOW_EXTRA_CONNECTORS=${SHOW_EXTRA_CONNECTORS:-} # Egnyte OAuth Configs - EGNYTE_CLIENT_ID=${EGNYTE_CLIENT_ID:-} - EGNYTE_CLIENT_SECRET=${EGNYTE_CLIENT_SECRET:-} From 9b92c2f353e3458635d88e12c44328df36652f89 Mon Sep 17 00:00:00 2001 From: Wenxi Onyx Date: Thu, 17 Jul 2025 16:58:46 -0700 Subject: [PATCH 05/78] greptile nit --- web/src/app/admin/add-connector/page.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/src/app/admin/add-connector/page.tsx b/web/src/app/admin/add-connector/page.tsx index 1c75372b273..7abb8b29702 100644 --- a/web/src/app/admin/add-connector/page.tsx +++ b/web/src/app/admin/add-connector/page.tsx @@ -200,7 +200,7 @@ export default function Page() { ); // Filter out the "Other" category if show_extra_connectors is false - if (settings?.settings?.show_extra_connectors === false) { + if (settings?.settings?.show_extra_connectors !== true) { const filteredCategories = Object.entries(categories).filter( ([category]) => category !== SourceCategory.Other ); From c1b706e6028d9bfa193575a7bcd6b22b8ae5351b Mon Sep 17 00:00:00 2001 From: Raunak Bhagat Date: Thu, 17 Jul 2025 15:41:31 -0700 Subject: [PATCH 06/78] fix: Move around group-sync tests (since they require docker services to be running) (#5041) * Move around tests * Add missing fixtures + change directory structure up some more * Add env variables --- 
.../pr-external-dependency-unit-tests.yml | 8 ++ .../workflows/pr-python-connector-tests.yml | 2 +- backend/tests/daily/conftest.py | 10 --- .../connectors/confluence/conftest.py | 0 .../confluence/test_confluence_group_sync.py | 84 +++++++++---------- .../test_google_drive_group_sync.py} | 0 6 files changed, 51 insertions(+), 53 deletions(-) rename backend/tests/{daily => external_dependency_unit}/connectors/confluence/conftest.py (100%) rename backend/tests/{daily => external_dependency_unit}/connectors/confluence/test_confluence_group_sync.py (63%) rename backend/tests/external_dependency_unit/{external_group_sync/test_external_group_sync.py => connectors/google_drive/test_google_drive_group_sync.py} (100%) diff --git a/.github/workflows/pr-external-dependency-unit-tests.yml b/.github/workflows/pr-external-dependency-unit-tests.yml index dd63e2e5a1a..a37f99ec307 100644 --- a/.github/workflows/pr-external-dependency-unit-tests.yml +++ b/.github/workflows/pr-external-dependency-unit-tests.yml @@ -13,6 +13,14 @@ env: # MinIO S3_ENDPOINT_URL: "http://localhost:9004" + # Confluence + CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }} + CONFLUENCE_TEST_SPACE: ${{ secrets.CONFLUENCE_TEST_SPACE }} + CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }} + CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }} + CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }} + CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }} + jobs: discover-test-dirs: runs-on: ubuntu-latest diff --git a/.github/workflows/pr-python-connector-tests.yml b/.github/workflows/pr-python-connector-tests.yml index aea2f89c473..c4c70ef3914 100644 --- a/.github/workflows/pr-python-connector-tests.yml +++ b/.github/workflows/pr-python-connector-tests.yml @@ -16,8 +16,8 @@ env: # Confluence CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }} CONFLUENCE_TEST_SPACE: ${{ secrets.CONFLUENCE_TEST_SPACE }} - CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }} CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }} + CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }} CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }} CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }} diff --git a/backend/tests/daily/conftest.py b/backend/tests/daily/conftest.py index 000d3b53a7f..4002b6c1180 100644 --- a/backend/tests/daily/conftest.py +++ b/backend/tests/daily/conftest.py @@ -6,7 +6,6 @@ from fastapi import FastAPI from fastapi.testclient import TestClient -from onyx.db.engine.sql_engine import SqlEngine from onyx.main import fetch_versioned_implementation from onyx.utils.logger import setup_logger @@ -24,12 +23,3 @@ def client() -> Generator[TestClient, Any, None]: )() client = TestClient(app) yield client - - -@pytest.fixture(scope="session", autouse=True) -def initialize_db() -> None: - # Make sure that the db engine is initialized before any tests are run - SqlEngine.init_engine( - pool_size=10, - max_overflow=5, - ) diff --git a/backend/tests/daily/connectors/confluence/conftest.py b/backend/tests/external_dependency_unit/connectors/confluence/conftest.py similarity index 100% rename from backend/tests/daily/connectors/confluence/conftest.py rename to backend/tests/external_dependency_unit/connectors/confluence/conftest.py diff --git a/backend/tests/daily/connectors/confluence/test_confluence_group_sync.py b/backend/tests/external_dependency_unit/connectors/confluence/test_confluence_group_sync.py similarity index 63% rename from 
backend/tests/daily/connectors/confluence/test_confluence_group_sync.py rename to backend/tests/external_dependency_unit/connectors/confluence/test_confluence_group_sync.py index 32ba600aad7..8017fba545b 100644 --- a/backend/tests/daily/connectors/confluence/test_confluence_group_sync.py +++ b/backend/tests/external_dependency_unit/connectors/confluence/test_confluence_group_sync.py @@ -1,9 +1,10 @@ from typing import Any +from sqlalchemy.orm import Session + from ee.onyx.external_permissions.confluence.group_sync import confluence_group_sync from onyx.configs.constants import DocumentSource from onyx.connectors.models import InputType -from onyx.db.engine.sql_engine import get_session_with_current_tenant from onyx.db.enums import AccessType from onyx.db.enums import ConnectorCredentialPairStatus from onyx.db.models import Connector @@ -83,51 +84,50 @@ def test_confluence_group_sync( - initialize_db: None, + db_session: Session, confluence_connector_config: dict[str, Any], confluence_credential_json: dict[str, Any], ) -> None: - with get_session_with_current_tenant() as db_session: - connector = Connector( - name="Test Connector", - source=DocumentSource.CONFLUENCE, - input_type=InputType.POLL, - connector_specific_config=confluence_connector_config, - refresh_freq=None, - prune_freq=None, - indexing_start=None, - ) - db_session.add(connector) - db_session.flush() + connector = Connector( + name="Test Connector", + source=DocumentSource.CONFLUENCE, + input_type=InputType.POLL, + connector_specific_config=confluence_connector_config, + refresh_freq=None, + prune_freq=None, + indexing_start=None, + ) + db_session.add(connector) + db_session.flush() - credential = Credential( - source=DocumentSource.CONFLUENCE, - credential_json=confluence_credential_json, - ) - db_session.add(credential) - db_session.flush() + credential = Credential( + source=DocumentSource.CONFLUENCE, + credential_json=confluence_credential_json, + ) + db_session.add(credential) + db_session.flush() - cc_pair = ConnectorCredentialPair( - connector_id=connector.id, - credential_id=credential.id, - name="Test CC Pair", - status=ConnectorCredentialPairStatus.ACTIVE, - access_type=AccessType.SYNC, - auto_sync_options=None, - ) - db_session.add(cc_pair) - db_session.commit() - db_session.refresh(cc_pair) + cc_pair = ConnectorCredentialPair( + connector_id=connector.id, + credential_id=credential.id, + name="Test CC Pair", + status=ConnectorCredentialPairStatus.ACTIVE, + access_type=AccessType.SYNC, + auto_sync_options=None, + ) + db_session.add(cc_pair) + db_session.commit() + db_session.refresh(cc_pair) - tenant_id = get_current_tenant_id() - group_sync_iter = confluence_group_sync( - tenant_id=tenant_id, - cc_pair=cc_pair, - ) + tenant_id = get_current_tenant_id() + group_sync_iter = confluence_group_sync( + tenant_id=tenant_id, + cc_pair=cc_pair, + ) - expected_groups = {group.id: group for group in _EXPECTED_CONFLUENCE_GROUPS} - actual_groups = { - group.id: ExternalUserGroupSet.from_model(external_user_group=group) - for group in group_sync_iter - } - assert expected_groups == actual_groups + expected_groups = {group.id: group for group in _EXPECTED_CONFLUENCE_GROUPS} + actual_groups = { + group.id: ExternalUserGroupSet.from_model(external_user_group=group) + for group in group_sync_iter + } + assert expected_groups == actual_groups diff --git a/backend/tests/external_dependency_unit/external_group_sync/test_external_group_sync.py 
b/backend/tests/external_dependency_unit/connectors/google_drive/test_google_drive_group_sync.py similarity index 100% rename from backend/tests/external_dependency_unit/external_group_sync/test_external_group_sync.py rename to backend/tests/external_dependency_unit/connectors/google_drive/test_google_drive_group_sync.py From 8edcb69ad234a5e0b9c1c368ef487660f52b1c46 Mon Sep 17 00:00:00 2001 From: Wenxi Date: Thu, 17 Jul 2025 16:23:46 -0700 Subject: [PATCH 07/78] remove chat session necessity from send message simple api (#5040) --- .../server/query_and_chat/chat_backend.py | 37 ++++++++++++++++--- .../ee/onyx/server/query_and_chat/models.py | 15 +++++++- 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/backend/ee/onyx/server/query_and_chat/chat_backend.py b/backend/ee/onyx/server/query_and_chat/chat_backend.py index 915564d69c3..2e30cf0be37 100644 --- a/backend/ee/onyx/server/query_and_chat/chat_backend.py +++ b/backend/ee/onyx/server/query_and_chat/chat_backend.py @@ -1,5 +1,6 @@ import re from typing import cast +from uuid import UUID from fastapi import APIRouter from fastapi import Depends @@ -73,6 +74,7 @@ def _get_final_context_doc_indices( def _convert_packet_stream_to_response( packets: ChatPacketStream, + chat_session_id: UUID, ) -> ChatBasicResponse: response = ChatBasicResponse() final_context_docs: list[LlmDoc] = [] @@ -216,6 +218,8 @@ def _convert_packet_stream_to_response( if answer: response.answer_citationless = remove_answer_citations(answer) + response.chat_session_id = chat_session_id + return response @@ -237,13 +241,36 @@ def handle_simplified_chat_message( if not chat_message_req.message: raise HTTPException(status_code=400, detail="Empty chat message is invalid") + # Handle chat session creation if chat_session_id is not provided + if chat_message_req.chat_session_id is None: + if chat_message_req.persona_id is None: + raise HTTPException( + status_code=400, + detail="Either chat_session_id or persona_id must be provided", + ) + + # Create a new chat session with the provided persona_id + try: + new_chat_session = create_chat_session( + db_session=db_session, + description="", # Leave empty for simple API + user_id=user.id if user else None, + persona_id=chat_message_req.persona_id, + ) + chat_session_id = new_chat_session.id + except Exception as e: + logger.exception(e) + raise HTTPException(status_code=400, detail="Invalid Persona provided.") + else: + chat_session_id = chat_message_req.chat_session_id + try: parent_message, _ = create_chat_chain( - chat_session_id=chat_message_req.chat_session_id, db_session=db_session + chat_session_id=chat_session_id, db_session=db_session ) except Exception: parent_message = get_or_create_root_message( - chat_session_id=chat_message_req.chat_session_id, db_session=db_session + chat_session_id=chat_session_id, db_session=db_session ) if ( @@ -258,7 +285,7 @@ def handle_simplified_chat_message( retrieval_options = chat_message_req.retrieval_options full_chat_msg_info = CreateChatMessageRequest( - chat_session_id=chat_message_req.chat_session_id, + chat_session_id=chat_session_id, parent_message_id=parent_message.id, message=chat_message_req.message, file_descriptors=[], @@ -283,7 +310,7 @@ def handle_simplified_chat_message( enforce_chat_session_id_for_search_docs=False, ) - return _convert_packet_stream_to_response(packets) + return _convert_packet_stream_to_response(packets, chat_session_id) @router.post("/send-message-simple-with-history") @@ -403,4 +430,4 @@ def handle_send_message_simple_with_history( 
enforce_chat_session_id_for_search_docs=False, ) - return _convert_packet_stream_to_response(packets) + return _convert_packet_stream_to_response(packets, chat_session.id) diff --git a/backend/ee/onyx/server/query_and_chat/models.py b/backend/ee/onyx/server/query_and_chat/models.py index d674e9ecb51..9a97c729f35 100644 --- a/backend/ee/onyx/server/query_and_chat/models.py +++ b/backend/ee/onyx/server/query_and_chat/models.py @@ -41,11 +41,13 @@ class DocumentSearchRequest(ChunkContext): class BasicCreateChatMessageRequest(ChunkContext): - """Before creating messages, be sure to create a chat_session and get an id + """If a chat_session_id is not provided, a persona_id must be provided to automatically create a new chat session Note, for simplicity this option only allows for a single linear chain of messages """ - chat_session_id: UUID + chat_session_id: UUID | None = None + # Optional persona_id to create a new chat session if chat_session_id is not provided + persona_id: int | None = None # New message contents message: str # Defaults to using retrieval with no additional filters @@ -62,6 +64,12 @@ class BasicCreateChatMessageRequest(ChunkContext): # If True, uses agentic search instead of basic search use_agentic_search: bool = False + @model_validator(mode="after") + def validate_chat_session_or_persona(self) -> "BasicCreateChatMessageRequest": + if self.chat_session_id is None and self.persona_id is None: + raise ValueError("Either chat_session_id or persona_id must be provided") + return self + class BasicCreateChatMessageWithHistoryRequest(ChunkContext): # Last element is the new query. All previous elements are historical context @@ -171,6 +179,9 @@ class ChatBasicResponse(BaseModel): agent_sub_queries: dict[int, dict[int, list[AgentSubQuery]]] | None = None agent_refined_answer_improvement: bool | None = None + # Chat session ID for tracking conversation continuity + chat_session_id: UUID | None = None + class OneShotQARequest(ChunkContext): # Supports simplier APIs that don't deal with chat histories or message edits From 28d5a63a1c84115f612b327cba5bf5c520b5b1ce Mon Sep 17 00:00:00 2001 From: Chris Weaver Date: Thu, 17 Jul 2025 23:51:39 -0700 Subject: [PATCH 08/78] Improve support for non-default postgres schemas (#5046) --- backend/alembic/env.py | 4 ++-- .../versions/36e9220ab794_update_kg_trigger_functions.py | 6 +++--- .../versions/495cb26ce93e_create_knowlege_graph_tables.py | 8 ++++---- backend/onyx/db/engine/async_sql_engine.py | 3 ++- backend/onyx/db/engine/sql_engine.py | 6 +++--- backend/onyx/kg/clustering/clustering.py | 4 ++-- backend/onyx/kg/clustering/normalizations.py | 4 ++-- backend/shared_configs/configs.py | 2 ++ 8 files changed, 20 insertions(+), 17 deletions(-) diff --git a/backend/alembic/env.py b/backend/alembic/env.py index d33d5c37a37..24ca51a89a5 100644 --- a/backend/alembic/env.py +++ b/backend/alembic/env.py @@ -23,7 +23,7 @@ from onyx.configs.constants import SSL_CERT_FILE from shared_configs.configs import ( MULTI_TENANT, - POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE, + POSTGRES_DEFAULT_SCHEMA, TENANT_ID_PREFIX, ) from onyx.db.models import Base @@ -271,7 +271,7 @@ async def run_async_migrations() -> None: ) = get_schema_options() if not schemas and not MULTI_TENANT: - schemas = [POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE] + schemas = [POSTGRES_DEFAULT_SCHEMA] # without init_engine, subsequent engine calls fail hard intentionally SqlEngine.init_engine(pool_size=20, max_overflow=5) diff --git 
a/backend/alembic/versions/36e9220ab794_update_kg_trigger_functions.py b/backend/alembic/versions/36e9220ab794_update_kg_trigger_functions.py index bde421fcb73..7c9e25fb179 100644 --- a/backend/alembic/versions/36e9220ab794_update_kg_trigger_functions.py +++ b/backend/alembic/versions/36e9220ab794_update_kg_trigger_functions.py @@ -9,7 +9,7 @@ from alembic import op from sqlalchemy.orm import Session from sqlalchemy import text -from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE +from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA # revision identifiers, used by Alembic. revision = "36e9220ab794" @@ -66,7 +66,7 @@ def upgrade() -> None: -- Set name and name trigrams NEW.name = name; - NEW.name_trigrams = {POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE}.show_trgm(cleaned_name); + NEW.name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name); RETURN NEW; END; $$ LANGUAGE plpgsql; @@ -111,7 +111,7 @@ def upgrade() -> None: UPDATE "{tenant_id}".kg_entity SET name = doc_name, - name_trigrams = {POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE}.show_trgm(cleaned_name) + name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name) WHERE document_id = NEW.id; RETURN NEW; END; diff --git a/backend/alembic/versions/495cb26ce93e_create_knowlege_graph_tables.py b/backend/alembic/versions/495cb26ce93e_create_knowlege_graph_tables.py index 65cf759d6f3..f1cbf003359 100644 --- a/backend/alembic/versions/495cb26ce93e_create_knowlege_graph_tables.py +++ b/backend/alembic/versions/495cb26ce93e_create_knowlege_graph_tables.py @@ -15,7 +15,7 @@ from onyx.configs.app_configs import DB_READONLY_USER from onyx.configs.app_configs import DB_READONLY_PASSWORD from shared_configs.configs import MULTI_TENANT -from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE +from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA # revision identifiers, used by Alembic. 
@@ -478,7 +478,7 @@ def upgrade() -> None: # Create GIN index for clustering and normalization op.execute( "CREATE INDEX IF NOT EXISTS idx_kg_entity_clustering_trigrams " - f"ON kg_entity USING GIN (name {POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE}.gin_trgm_ops)" + f"ON kg_entity USING GIN (name {POSTGRES_DEFAULT_SCHEMA}.gin_trgm_ops)" ) op.execute( "CREATE INDEX IF NOT EXISTS idx_kg_entity_normalization_trigrams " @@ -518,7 +518,7 @@ def upgrade() -> None: -- Set name and name trigrams NEW.name = name; - NEW.name_trigrams = {POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE}.show_trgm(cleaned_name); + NEW.name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name); RETURN NEW; END; $$ LANGUAGE plpgsql; @@ -563,7 +563,7 @@ def upgrade() -> None: UPDATE kg_entity SET name = doc_name, - name_trigrams = {POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE}.show_trgm(cleaned_name) + name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name) WHERE document_id = NEW.id; RETURN NEW; END; diff --git a/backend/onyx/db/engine/async_sql_engine.py b/backend/onyx/db/engine/async_sql_engine.py index a871dbad1ed..0bce3561899 100644 --- a/backend/onyx/db/engine/async_sql_engine.py +++ b/backend/onyx/db/engine/async_sql_engine.py @@ -29,6 +29,7 @@ from onyx.db.engine.sql_engine import SqlEngine from onyx.db.engine.sql_engine import USE_IAM_AUTH from shared_configs.configs import MULTI_TENANT +from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE from shared_configs.contextvars import get_current_tenant_id @@ -118,7 +119,7 @@ async def get_async_session( engine = get_sqlalchemy_async_engine() # no need to use the schema translation map for self-hosted + default schema - if not MULTI_TENANT: + if not MULTI_TENANT and tenant_id == POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE: async with AsyncSession(bind=engine, expire_on_commit=False) as session: yield session return diff --git a/backend/onyx/db/engine/sql_engine.py b/backend/onyx/db/engine/sql_engine.py index beac099265e..459afb9d849 100644 --- a/backend/onyx/db/engine/sql_engine.py +++ b/backend/onyx/db/engine/sql_engine.py @@ -31,6 +31,7 @@ from onyx.utils.logger import setup_logger from shared_configs.configs import MULTI_TENANT from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA +from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR from shared_configs.contextvars import get_current_tenant_id @@ -324,7 +325,7 @@ def get_session_with_tenant(*, tenant_id: str) -> Generator[Session, None, None] raise HTTPException(status_code=400, detail="Invalid tenant ID") # no need to use the schema translation map for self-hosted + default schema - if not MULTI_TENANT: + if not MULTI_TENANT and tenant_id == POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE: with Session(bind=engine, expire_on_commit=False) as session: yield session return @@ -370,12 +371,11 @@ def get_db_readonly_user_session_with_current_tenant() -> ( raise HTTPException(status_code=400, detail="Invalid tenant ID") # no need to use the schema translation map for self-hosted + default schema - if not MULTI_TENANT: + if not MULTI_TENANT and tenant_id == POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE: with Session(readonly_engine, expire_on_commit=False) as session: yield session return - # no need to use the schema translation map for self-hosted + default schema schema_translate_map = {None: tenant_id} with readonly_engine.connect().execution_options( schema_translate_map=schema_translate_map diff --git 
a/backend/onyx/kg/clustering/clustering.py b/backend/onyx/kg/clustering/clustering.py index 12ca5ffa023..7012e01d23b 100644 --- a/backend/onyx/kg/clustering/clustering.py +++ b/backend/onyx/kg/clustering/clustering.py @@ -34,7 +34,7 @@ from onyx.kg.utils.formatting_utils import make_relationship_id from onyx.utils.logger import setup_logger from onyx.utils.threadpool_concurrency import run_functions_tuples_in_parallel -from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE +from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA logger = setup_logger() @@ -180,7 +180,7 @@ def _cluster_one_grounded_entity( # find entities of the same type with a similar name *filtering, KGEntity.entity_type_id_name == entity.entity_type_id_name, - getattr(func, POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE).similarity_op( + getattr(func, POSTGRES_DEFAULT_SCHEMA).similarity_op( KGEntity.name, entity_name ), ) diff --git a/backend/onyx/kg/clustering/normalizations.py b/backend/onyx/kg/clustering/normalizations.py index 3f272021145..3a611cec7ad 100644 --- a/backend/onyx/kg/clustering/normalizations.py +++ b/backend/onyx/kg/clustering/normalizations.py @@ -33,7 +33,7 @@ from onyx.kg.utils.formatting_utils import split_relationship_id from onyx.utils.logger import setup_logger from onyx.utils.threadpool_concurrency import run_functions_tuples_in_parallel -from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE +from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA logger = setup_logger() @@ -95,7 +95,7 @@ def _normalize_one_entity( # generate trigrams of the queried entity Q query_trigrams = db_session.query( - getattr(func, POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE) + getattr(func, POSTGRES_DEFAULT_SCHEMA) .show_trgm(cleaned_entity) .cast(ARRAY(String(3))) .label("trigrams") diff --git a/backend/shared_configs/configs.py b/backend/shared_configs/configs.py index 12d12a49697..a21e890e360 100644 --- a/backend/shared_configs/configs.py +++ b/backend/shared_configs/configs.py @@ -140,6 +140,8 @@ def validate_cors_origin(origin: str) -> None: # Multi-tenancy configuration MULTI_TENANT = os.environ.get("MULTI_TENANT", "").lower() == "true" +# Outside this file, should almost always use `POSTGRES_DEFAULT_SCHEMA` unless you +# have a very good reason POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE = "public" POSTGRES_DEFAULT_SCHEMA = ( os.environ.get("POSTGRES_DEFAULT_SCHEMA") or POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE From 524eb1e8b7e92e8d942cb73ac3f8ed9ac47871d3 Mon Sep 17 00:00:00 2001 From: Chris Weaver Date: Thu, 17 Jul 2025 23:52:51 -0700 Subject: [PATCH 09/78] fix: improve check for indexing status (#5042) * Improve check_for_indexing + check_for_vespa_sync_task * Remove unused * Fix * Simplify query * Add more logging * Address bot comments * Increase # of tasks generated since we're not going cc-pair by cc-pair * Only index 50 user files at a time --- .../onyx/background/celery/apps/app_base.py | 10 +- .../onyx/background/celery/apps/primary.py | 9 +- .../background/celery/tasks/indexing/tasks.py | 39 +++- .../celery/tasks/vespa/document_sync.py | 178 +++++++++++++++ .../background/celery/tasks/vespa/tasks.py | 109 ++------- backend/onyx/configs/app_configs.py | 2 +- backend/onyx/db/connector_credential_pair.py | 30 ++- backend/onyx/db/document.py | 71 +----- backend/onyx/db/enums.py | 14 +- .../redis/redis_connector_credential_pair.py | 207 ------------------ backend/onyx/redis/redis_utils.py | 5 - 11 files changed, 282 insertions(+), 392 deletions(-) create mode 100644 
backend/onyx/background/celery/tasks/vespa/document_sync.py delete mode 100644 backend/onyx/redis/redis_connector_credential_pair.py diff --git a/backend/onyx/background/celery/apps/app_base.py b/backend/onyx/background/celery/apps/app_base.py index e4619ce3c11..59ecf1cba59 100644 --- a/backend/onyx/background/celery/apps/app_base.py +++ b/backend/onyx/background/celery/apps/app_base.py @@ -24,13 +24,14 @@ from onyx.background.celery.apps.task_formatters import CeleryTaskPlainFormatter from onyx.background.celery.celery_utils import celery_is_worker_primary from onyx.background.celery.celery_utils import make_probe_path +from onyx.background.celery.tasks.vespa.document_sync import DOCUMENT_SYNC_PREFIX +from onyx.background.celery.tasks.vespa.document_sync import DOCUMENT_SYNC_TASKSET_KEY from onyx.configs.constants import ONYX_CLOUD_CELERY_TASK_PREFIX from onyx.configs.constants import OnyxRedisLocks from onyx.db.engine.sql_engine import get_sqlalchemy_engine from onyx.document_index.vespa.shared_utils.utils import wait_for_vespa_with_timeout from onyx.httpx.httpx_pool import HttpxPool from onyx.redis.redis_connector import RedisConnector -from onyx.redis.redis_connector_credential_pair import RedisConnectorCredentialPair from onyx.redis.redis_connector_delete import RedisConnectorDelete from onyx.redis.redis_connector_doc_perm_sync import RedisConnectorPermissionSync from onyx.redis.redis_connector_ext_group_sync import RedisConnectorExternalGroupSync @@ -145,8 +146,11 @@ def on_task_postrun( r = get_redis_client(tenant_id=tenant_id) - if task_id.startswith(RedisConnectorCredentialPair.PREFIX): - r.srem(RedisConnectorCredentialPair.get_taskset_key(), task_id) + # NOTE: we want to remove the `Redis*` classes, prefer to just have functions to + # do these things going forward. 
In short, things should generally be like the doc + # sync task rather than the others below + if task_id.startswith(DOCUMENT_SYNC_PREFIX): + r.srem(DOCUMENT_SYNC_TASKSET_KEY, task_id) return if task_id.startswith(RedisDocumentSet.PREFIX): diff --git a/backend/onyx/background/celery/apps/primary.py b/backend/onyx/background/celery/apps/primary.py index 298918f0ef2..e63546a7488 100644 --- a/backend/onyx/background/celery/apps/primary.py +++ b/backend/onyx/background/celery/apps/primary.py @@ -21,6 +21,7 @@ from onyx.background.celery.tasks.indexing.utils import ( get_unfenced_index_attempt_ids, ) +from onyx.background.celery.tasks.vespa.document_sync import reset_document_sync from onyx.configs.constants import CELERY_PRIMARY_WORKER_LOCK_TIMEOUT from onyx.configs.constants import OnyxRedisConstants from onyx.configs.constants import OnyxRedisLocks @@ -29,9 +30,6 @@ from onyx.db.engine.sql_engine import SqlEngine from onyx.db.index_attempt import get_index_attempt from onyx.db.index_attempt import mark_attempt_canceled -from onyx.redis.redis_connector_credential_pair import ( - RedisGlobalConnectorCredentialPair, -) from onyx.redis.redis_connector_delete import RedisConnectorDelete from onyx.redis.redis_connector_doc_perm_sync import RedisConnectorPermissionSync from onyx.redis.redis_connector_ext_group_sync import RedisConnectorExternalGroupSync @@ -156,7 +154,10 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None: r.delete(OnyxRedisConstants.ACTIVE_FENCES) - RedisGlobalConnectorCredentialPair.reset_all(r) + # NOTE: we want to remove the `Redis*` classes, prefer to just have functions + # This is the preferred way to do this going forward + reset_document_sync(r) + RedisDocumentSet.reset_all(r) RedisUserGroup.reset_all(r) RedisConnectorDelete.reset_all(r) diff --git a/backend/onyx/background/celery/tasks/indexing/tasks.py b/backend/onyx/background/celery/tasks/indexing/tasks.py index b07a0a2133e..c911271f154 100644 --- a/backend/onyx/background/celery/tasks/indexing/tasks.py +++ b/backend/onyx/background/celery/tasks/indexing/tasks.py @@ -54,7 +54,10 @@ from onyx.configs.constants import OnyxRedisSignals from onyx.connectors.exceptions import ConnectorValidationError from onyx.db.connector import mark_ccpair_with_indexing_trigger -from onyx.db.connector_credential_pair import fetch_connector_credential_pairs +from onyx.db.connector_credential_pair import ConnectorType +from onyx.db.connector_credential_pair import ( + fetch_indexable_connector_credential_pair_ids, +) from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id from onyx.db.connector_credential_pair import set_cc_pair_repeated_error_state from onyx.db.engine.sql_engine import get_session_with_current_tenant @@ -86,6 +89,8 @@ logger = setup_logger() +USER_FILE_INDEXING_LIMIT = 100 + def _get_fence_validation_block_expiration() -> int: """ @@ -480,20 +485,37 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None: embedding_model=embedding_model, ) - # gather cc_pair_ids + # gather cc_pair_ids + current search settings lock_beat.reacquire() - cc_pair_ids: list[int] = [] with get_session_with_current_tenant() as db_session: - cc_pairs = fetch_connector_credential_pairs( - db_session, include_user_files=True + standard_cc_pair_ids = fetch_indexable_connector_credential_pair_ids( + db_session, connector_type=ConnectorType.STANDARD + ) + # only index 50 user files at a time. This makes sense since user files are + # indexed only once, and then they are done. 
In practice, we would rarely + # have more than `USER_FILE_INDEXING_LIMIT` user files to index. + user_file_cc_pair_ids = fetch_indexable_connector_credential_pair_ids( + db_session, + connector_type=ConnectorType.USER_FILE, + limit=USER_FILE_INDEXING_LIMIT, ) - for cc_pair_entry in cc_pairs: - cc_pair_ids.append(cc_pair_entry.id) + cc_pair_ids = standard_cc_pair_ids + user_file_cc_pair_ids + + # NOTE: some potential race conditions here, but the worse case is + # kicking off some "invalid" indexing tasks which will just fail + search_settings_list = get_active_search_settings_list(db_session) + + current_search_settings = next( + search_settings_instance + for search_settings_instance in search_settings_list + if search_settings_instance.status.is_current() + ) # mark CC Pairs that are repeatedly failing as in repeated error state with get_session_with_current_tenant() as db_session: - current_search_settings = get_current_search_settings(db_session) for cc_pair_id in cc_pair_ids: + lock_beat.reacquire() + if is_in_repeated_error_state( cc_pair_id=cc_pair_id, search_settings_id=current_search_settings.id, @@ -511,7 +533,6 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None: redis_connector = RedisConnector(tenant_id, cc_pair_id) with get_session_with_current_tenant() as db_session: - search_settings_list = get_active_search_settings_list(db_session) for search_settings_instance in search_settings_list: # skip non-live search settings that don't have background reindex enabled # those should just auto-change to live shortly after creation without diff --git a/backend/onyx/background/celery/tasks/vespa/document_sync.py b/backend/onyx/background/celery/tasks/vespa/document_sync.py new file mode 100644 index 00000000000..489d127a830 --- /dev/null +++ b/backend/onyx/background/celery/tasks/vespa/document_sync.py @@ -0,0 +1,178 @@ +import time +from typing import cast +from uuid import uuid4 + +from celery import Celery +from redis import Redis +from redis.lock import Lock as RedisLock +from sqlalchemy.orm import Session + +from onyx.configs.app_configs import DB_YIELD_PER_DEFAULT +from onyx.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT +from onyx.configs.constants import OnyxCeleryPriority +from onyx.configs.constants import OnyxCeleryQueues +from onyx.configs.constants import OnyxCeleryTask +from onyx.configs.constants import OnyxRedisConstants +from onyx.db.document import construct_document_id_select_by_needs_sync +from onyx.db.document import count_documents_by_needs_sync +from onyx.utils.logger import setup_logger + +# Redis keys for document sync tracking +DOCUMENT_SYNC_PREFIX = "documentsync" +DOCUMENT_SYNC_FENCE_KEY = f"{DOCUMENT_SYNC_PREFIX}_fence" +DOCUMENT_SYNC_TASKSET_KEY = f"{DOCUMENT_SYNC_PREFIX}_taskset" + +logger = setup_logger() + + +def is_document_sync_fenced(r: Redis) -> bool: + """Check if document sync tasks are currently in progress.""" + return bool(r.exists(DOCUMENT_SYNC_FENCE_KEY)) + + +def get_document_sync_payload(r: Redis) -> int | None: + """Get the initial number of tasks that were created.""" + bytes_result = r.get(DOCUMENT_SYNC_FENCE_KEY) + if bytes_result is None: + return None + return int(cast(int, bytes_result)) + + +def get_document_sync_remaining(r: Redis) -> int: + """Get the number of tasks still pending completion.""" + return cast(int, r.scard(DOCUMENT_SYNC_TASKSET_KEY)) + + +def set_document_sync_fence(r: Redis, payload: int | None) -> None: + """Set up the fence and register with active fences.""" + if payload is 
None: + r.srem(OnyxRedisConstants.ACTIVE_FENCES, DOCUMENT_SYNC_FENCE_KEY) + r.delete(DOCUMENT_SYNC_FENCE_KEY) + return + + r.set(DOCUMENT_SYNC_FENCE_KEY, payload) + r.sadd(OnyxRedisConstants.ACTIVE_FENCES, DOCUMENT_SYNC_FENCE_KEY) + + +def delete_document_sync_taskset(r: Redis) -> None: + """Clear the document sync taskset.""" + r.delete(DOCUMENT_SYNC_TASKSET_KEY) + + +def reset_document_sync(r: Redis) -> None: + """Reset all document sync tracking data.""" + r.srem(OnyxRedisConstants.ACTIVE_FENCES, DOCUMENT_SYNC_FENCE_KEY) + r.delete(DOCUMENT_SYNC_TASKSET_KEY) + r.delete(DOCUMENT_SYNC_FENCE_KEY) + + +def generate_document_sync_tasks( + r: Redis, + max_tasks: int, + celery_app: Celery, + db_session: Session, + lock: RedisLock, + tenant_id: str, +) -> tuple[int, int]: + """Generate sync tasks for all documents that need syncing. + + Args: + r: Redis client + max_tasks: Maximum number of tasks to generate + celery_app: Celery application instance + db_session: Database session + lock: Redis lock for coordination + tenant_id: Tenant identifier + + Returns: + tuple[int, int]: (tasks_generated, total_docs_found) + """ + last_lock_time = time.monotonic() + num_tasks_sent = 0 + num_docs = 0 + + # Get all documents that need syncing + stmt = construct_document_id_select_by_needs_sync() + + for doc_id in db_session.scalars(stmt).yield_per(DB_YIELD_PER_DEFAULT): + doc_id = cast(str, doc_id) + current_time = time.monotonic() + + # Reacquire lock periodically to prevent timeout + if current_time - last_lock_time >= (CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4): + lock.reacquire() + last_lock_time = current_time + + num_docs += 1 + + # Create a unique task ID + custom_task_id = f"{DOCUMENT_SYNC_PREFIX}_{uuid4()}" + + # Add to the tracking taskset in Redis BEFORE creating the celery task + r.sadd(DOCUMENT_SYNC_TASKSET_KEY, custom_task_id) + + # Create the Celery task + celery_app.send_task( + OnyxCeleryTask.VESPA_METADATA_SYNC_TASK, + kwargs=dict(document_id=doc_id, tenant_id=tenant_id), + queue=OnyxCeleryQueues.VESPA_METADATA_SYNC, + task_id=custom_task_id, + priority=OnyxCeleryPriority.MEDIUM, + ignore_result=True, + ) + + num_tasks_sent += 1 + + if num_tasks_sent >= max_tasks: + break + + return num_tasks_sent, num_docs + + +def try_generate_stale_document_sync_tasks( + celery_app: Celery, + max_tasks: int, + db_session: Session, + r: Redis, + lock_beat: RedisLock, + tenant_id: str, +) -> int | None: + # the fence is up, do nothing + if is_document_sync_fenced(r): + return None + + # add tasks to celery and build up the task set to monitor in redis + stale_doc_count = count_documents_by_needs_sync(db_session) + if stale_doc_count == 0: + logger.info("No stale documents found. Skipping sync tasks generation.") + return None + + logger.info( + f"Stale documents found (at least {stale_doc_count}). Generating sync tasks in one batch." + ) + + logger.info("generate_document_sync_tasks starting for all documents.") + + # Generate all tasks in one pass + result = generate_document_sync_tasks( + r, max_tasks, celery_app, db_session, lock_beat, tenant_id + ) + + if result is None: + return None + + tasks_generated, total_docs = result + + if tasks_generated >= max_tasks: + logger.info( + f"generate_document_sync_tasks reached the task generation limit: " + f"tasks_generated={tasks_generated} max_tasks={max_tasks}" + ) + else: + logger.info( + f"generate_document_sync_tasks finished for all documents. 
" + f"tasks_generated={tasks_generated} total_docs_found={total_docs}" + ) + + set_document_sync_fence(r, tasks_generated) + return tasks_generated diff --git a/backend/onyx/background/celery/tasks/vespa/tasks.py b/backend/onyx/background/celery/tasks/vespa/tasks.py index 9966d8e5934..cee8a6b0e53 100644 --- a/backend/onyx/background/celery/tasks/vespa/tasks.py +++ b/backend/onyx/background/celery/tasks/vespa/tasks.py @@ -20,14 +20,19 @@ from onyx.background.celery.tasks.shared.tasks import LIGHT_SOFT_TIME_LIMIT from onyx.background.celery.tasks.shared.tasks import LIGHT_TIME_LIMIT from onyx.background.celery.tasks.shared.tasks import OnyxCeleryTaskCompletionStatus +from onyx.background.celery.tasks.vespa.document_sync import DOCUMENT_SYNC_FENCE_KEY +from onyx.background.celery.tasks.vespa.document_sync import get_document_sync_payload +from onyx.background.celery.tasks.vespa.document_sync import get_document_sync_remaining +from onyx.background.celery.tasks.vespa.document_sync import reset_document_sync +from onyx.background.celery.tasks.vespa.document_sync import ( + try_generate_stale_document_sync_tasks, +) from onyx.configs.app_configs import JOB_TIMEOUT from onyx.configs.app_configs import VESPA_SYNC_MAX_TASKS from onyx.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT from onyx.configs.constants import OnyxCeleryTask from onyx.configs.constants import OnyxRedisConstants from onyx.configs.constants import OnyxRedisLocks -from onyx.db.connector_credential_pair import get_connector_credential_pairs -from onyx.db.document import count_documents_by_needs_sync from onyx.db.document import get_document from onyx.db.document import mark_document_as_synced from onyx.db.document_set import delete_document_set @@ -47,10 +52,6 @@ from onyx.document_index.factory import get_default_document_index from onyx.document_index.interfaces import VespaDocumentFields from onyx.httpx.httpx_pool import HttpxPool -from onyx.redis.redis_connector_credential_pair import RedisConnectorCredentialPair -from onyx.redis.redis_connector_credential_pair import ( - RedisGlobalConnectorCredentialPair, -) from onyx.redis.redis_document_set import RedisDocumentSet from onyx.redis.redis_pool import get_redis_client from onyx.redis.redis_pool import get_redis_replica_client @@ -166,8 +167,11 @@ def check_for_vespa_sync_task(self: Task, *, tenant_id: str) -> bool | None: continue key_str = key_bytes.decode("utf-8") - if key_str == RedisGlobalConnectorCredentialPair.FENCE_KEY: - monitor_connector_taskset(r) + # NOTE: removing the "Redis*" classes, prefer to just have functions to + # do these things going forward. 
In short, things should generally be like the doc + # sync task rather than the others + if key_str == DOCUMENT_SYNC_FENCE_KEY: + monitor_document_sync_taskset(r) elif key_str.startswith(RedisDocumentSet.FENCE_PREFIX): with get_session_with_current_tenant() as db_session: monitor_document_set_taskset(tenant_id, key_bytes, r, db_session) @@ -203,82 +207,6 @@ def check_for_vespa_sync_task(self: Task, *, tenant_id: str) -> bool | None: return True -def try_generate_stale_document_sync_tasks( - celery_app: Celery, - max_tasks: int, - db_session: Session, - r: Redis, - lock_beat: RedisLock, - tenant_id: str, -) -> int | None: - # the fence is up, do nothing - - redis_global_ccpair = RedisGlobalConnectorCredentialPair(r) - if redis_global_ccpair.fenced: - return None - - redis_global_ccpair.delete_taskset() - - # add tasks to celery and build up the task set to monitor in redis - stale_doc_count = count_documents_by_needs_sync(db_session) - if stale_doc_count == 0: - return None - - task_logger.info( - f"Stale documents found (at least {stale_doc_count}). Generating sync tasks by cc pair." - ) - - task_logger.info( - "RedisConnector.generate_tasks starting by cc_pair. " - "Documents spanning multiple cc_pairs will only be synced once." - ) - - docs_to_skip: set[str] = set() - - # rkuo: we could technically sync all stale docs in one big pass. - # but I feel it's more understandable to group the docs by cc_pair - total_tasks_generated = 0 - tasks_remaining = max_tasks - cc_pairs = get_connector_credential_pairs(db_session) - for cc_pair in cc_pairs: - lock_beat.reacquire() - - rc = RedisConnectorCredentialPair(tenant_id, cc_pair.id) - rc.set_skip_docs(docs_to_skip) - result = rc.generate_tasks( - tasks_remaining, celery_app, db_session, r, lock_beat, tenant_id - ) - - if result is None: - continue - - if result[1] == 0: - continue - - task_logger.info( - f"RedisConnector.generate_tasks finished for single cc_pair. " - f"cc_pair={cc_pair.id} tasks_generated={result[0]} tasks_possible={result[1]}" - ) - - total_tasks_generated += result[0] - tasks_remaining -= result[0] - if tasks_remaining <= 0: - break - - if tasks_remaining <= 0: - task_logger.info( - f"RedisConnector.generate_tasks reached the task generation limit: " - f"total_tasks_generated={total_tasks_generated} max_tasks={max_tasks}" - ) - else: - task_logger.info( - f"RedisConnector.generate_tasks finished for all cc_pairs. total_tasks_generated={total_tasks_generated}" - ) - - redis_global_ccpair.set_fence(total_tasks_generated) - return total_tasks_generated - - def try_generate_document_set_sync_tasks( celery_app: Celery, document_set_id: int, @@ -433,19 +361,18 @@ def try_generate_user_group_sync_tasks( return tasks_generated -def monitor_connector_taskset(r: Redis) -> None: - redis_global_ccpair = RedisGlobalConnectorCredentialPair(r) - initial_count = redis_global_ccpair.payload +def monitor_document_sync_taskset(r: Redis) -> None: + initial_count = get_document_sync_payload(r) if initial_count is None: return - remaining = redis_global_ccpair.get_remaining() + remaining = get_document_sync_remaining(r) task_logger.info( - f"Stale document sync progress: remaining={remaining} initial={initial_count}" + f"Document sync progress: remaining={remaining} initial={initial_count}" ) if remaining == 0: - redis_global_ccpair.reset() - task_logger.info(f"Successfully synced stale documents. count={initial_count}") + reset_document_sync(r) + task_logger.info(f"Successfully synced all documents. 
count={initial_count}") def monitor_document_set_taskset( diff --git a/backend/onyx/configs/app_configs.py b/backend/onyx/configs/app_configs.py index 047e49d7d8b..650ae61fb87 100644 --- a/backend/onyx/configs/app_configs.py +++ b/backend/onyx/configs/app_configs.py @@ -332,7 +332,7 @@ ) # The maximum number of tasks that can be queued up to sync to Vespa in a single pass -VESPA_SYNC_MAX_TASKS = 1024 +VESPA_SYNC_MAX_TASKS = 8192 DB_YIELD_PER_DEFAULT = 64 diff --git a/backend/onyx/db/connector_credential_pair.py b/backend/onyx/db/connector_credential_pair.py index 7de64dfa56b..040a43ec909 100644 --- a/backend/onyx/db/connector_credential_pair.py +++ b/backend/onyx/db/connector_credential_pair.py @@ -1,4 +1,5 @@ from datetime import datetime +from enum import Enum from typing import TypeVarTuple from fastapi import HTTPException @@ -41,6 +42,11 @@ R = TypeVarTuple("R") +class ConnectorType(str, Enum): + STANDARD = "standard" + USER_FILE = "user_file" + + def _add_user_filters( stmt: Select[tuple[*R]], user: User | None, get_editable: bool = True ) -> Select[tuple[*R]]: @@ -619,14 +625,24 @@ def remove_credential_from_connector( ) -def fetch_connector_credential_pairs( +def fetch_indexable_connector_credential_pair_ids( db_session: Session, - include_user_files: bool = False, -) -> list[ConnectorCredentialPair]: - stmt = select(ConnectorCredentialPair) - if not include_user_files: - stmt = stmt.where(ConnectorCredentialPair.is_user_file != True) # noqa: E712 - return list(db_session.scalars(stmt).unique().all()) + connector_type: ConnectorType | None = None, + limit: int | None = None, +) -> list[int]: + stmt = select(ConnectorCredentialPair.id) + stmt = stmt.where( + ConnectorCredentialPair.status.in_( + ConnectorCredentialPairStatus.active_statuses() + ) + ) + if connector_type == ConnectorType.USER_FILE: + stmt = stmt.where(ConnectorCredentialPair.is_user_file.is_(True)) + elif connector_type == ConnectorType.STANDARD: + stmt = stmt.where(ConnectorCredentialPair.is_user_file.is_(False)) + if limit: + stmt = stmt.limit(limit) + return list(db_session.scalars(stmt).all()) def fetch_connector_credential_pair_for_connector( diff --git a/backend/onyx/db/document.py b/backend/onyx/db/document.py index f7d0afecf7f..729cbd4f51e 100644 --- a/backend/onyx/db/document.py +++ b/backend/onyx/db/document.py @@ -79,10 +79,6 @@ def count_documents_by_needs_sync(session: Session) -> int: return ( session.query(DbDocument.id) - .join( - DocumentByConnectorCredentialPair, - DbDocument.id == DocumentByConnectorCredentialPair.id, - ) .filter( or_( DbDocument.last_modified > DbDocument.last_synced, @@ -93,67 +89,22 @@ def count_documents_by_needs_sync(session: Session) -> int: ) -def construct_document_select_for_connector_credential_pair_by_needs_sync( - connector_id: int, credential_id: int -) -> Select: - return ( - select(DbDocument) - .join( - DocumentByConnectorCredentialPair, - DbDocument.id == DocumentByConnectorCredentialPair.id, - ) - .where( - and_( - DocumentByConnectorCredentialPair.connector_id == connector_id, - DocumentByConnectorCredentialPair.credential_id == credential_id, - or_( - DbDocument.last_modified > DbDocument.last_synced, - DbDocument.last_synced.is_(None), - ), - ) - ) - ) - +def construct_document_id_select_by_needs_sync() -> Select: + """Get all document IDs that need syncing across all connector credential pairs. 
-def construct_document_id_select_for_connector_credential_pair_by_needs_sync( - connector_id: int, credential_id: int -) -> Select: - return ( - select(DbDocument.id) - .join( - DocumentByConnectorCredentialPair, - DbDocument.id == DocumentByConnectorCredentialPair.id, - ) - .where( - and_( - DocumentByConnectorCredentialPair.connector_id == connector_id, - DocumentByConnectorCredentialPair.credential_id == credential_id, - or_( - DbDocument.last_modified > DbDocument.last_synced, - DbDocument.last_synced.is_(None), - ), - ) + Returns a Select statement for documents where: + 1. last_modified is newer than last_synced + 2. last_synced is null (meaning we've never synced) + AND the document has a relationship with a connector/credential pair + """ + return select(DbDocument.id).where( + or_( + DbDocument.last_modified > DbDocument.last_synced, + DbDocument.last_synced.is_(None), ) ) -def get_all_documents_needing_vespa_sync_for_cc_pair( - db_session: Session, cc_pair_id: int -) -> list[DbDocument]: - cc_pair = get_connector_credential_pair_from_id( - db_session=db_session, - cc_pair_id=cc_pair_id, - ) - if not cc_pair: - raise ValueError(f"No CC pair found with ID: {cc_pair_id}") - - stmt = construct_document_select_for_connector_credential_pair_by_needs_sync( - cc_pair.connector_id, cc_pair.credential_id - ) - - return list(db_session.scalars(stmt).all()) - - def construct_document_id_select_for_connector_credential_pair( connector_id: int, credential_id: int | None = None ) -> Select: diff --git a/backend/onyx/db/enums.py b/backend/onyx/db/enums.py index 0730096990a..39ef8574233 100644 --- a/backend/onyx/db/enums.py +++ b/backend/onyx/db/enums.py @@ -86,12 +86,16 @@ class ConnectorCredentialPairStatus(str, PyEnum): DELETING = "DELETING" INVALID = "INVALID" + @classmethod + def active_statuses(cls) -> list["ConnectorCredentialPairStatus"]: + return [ + ConnectorCredentialPairStatus.ACTIVE, + ConnectorCredentialPairStatus.SCHEDULED, + ConnectorCredentialPairStatus.INITIAL_INDEXING, + ] + def is_active(self) -> bool: - return ( - self == ConnectorCredentialPairStatus.ACTIVE - or self == ConnectorCredentialPairStatus.SCHEDULED - or self == ConnectorCredentialPairStatus.INITIAL_INDEXING - ) + return self in self.active_statuses() class AccessType(str, PyEnum): diff --git a/backend/onyx/redis/redis_connector_credential_pair.py b/backend/onyx/redis/redis_connector_credential_pair.py deleted file mode 100644 index 5bbbd2e08f2..00000000000 --- a/backend/onyx/redis/redis_connector_credential_pair.py +++ /dev/null @@ -1,207 +0,0 @@ -import time -from typing import cast -from uuid import uuid4 - -import redis -from celery import Celery -from redis import Redis -from redis.lock import Lock as RedisLock -from sqlalchemy.orm import Session - -from onyx.configs.app_configs import DB_YIELD_PER_DEFAULT -from onyx.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT -from onyx.configs.constants import OnyxCeleryPriority -from onyx.configs.constants import OnyxCeleryQueues -from onyx.configs.constants import OnyxCeleryTask -from onyx.configs.constants import OnyxRedisConstants -from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id -from onyx.db.document import ( - construct_document_id_select_for_connector_credential_pair_by_needs_sync, -) -from onyx.redis.redis_object_helper import RedisObjectHelper - - -class RedisConnectorCredentialPair(RedisObjectHelper): - """This class is used to scan documents by cc_pair in the db and collect them into - a unified set for 
syncing. - - It differs from the other redis helpers in that the taskset used spans - all connectors and is not per connector.""" - - PREFIX = "connectorsync" - TASKSET_PREFIX = PREFIX + "_taskset" - - def __init__(self, tenant_id: str, id: int) -> None: - super().__init__(tenant_id, str(id)) - - # documents that should be skipped - self.skip_docs: set[str] = set() - - @classmethod - def get_taskset_key(cls) -> str: - return RedisConnectorCredentialPair.TASKSET_PREFIX - - @property - def taskset_key(self) -> str: - """Notice that this is intentionally reusing the same taskset for all - connector syncs""" - # example: connectorsync_taskset - return f"{self.TASKSET_PREFIX}" - - def set_skip_docs(self, skip_docs: set[str]) -> None: - # documents that should be skipped. Note that this class updates - # the list on the fly - self.skip_docs = skip_docs - - def generate_tasks( - self, - max_tasks: int, - celery_app: Celery, - db_session: Session, - redis_client: Redis, - lock: RedisLock, - tenant_id: str, - ) -> tuple[int, int] | None: - """We can limit the number of tasks generated here, which is useful to prevent - one tenant from overwhelming the sync queue. - - This works because the dirty state of a document is in the DB, so more docs - get picked up after the limited set of tasks is complete. - """ - - last_lock_time = time.monotonic() - - num_tasks_sent = 0 - - cc_pair = get_connector_credential_pair_from_id( - db_session=db_session, - cc_pair_id=int(self._id), - ) - if not cc_pair: - return None - - stmt = construct_document_id_select_for_connector_credential_pair_by_needs_sync( - cc_pair.connector_id, cc_pair.credential_id - ) - - num_docs = 0 - - for doc_id in db_session.scalars(stmt).yield_per(DB_YIELD_PER_DEFAULT): - doc_id = cast(str, doc_id) - current_time = time.monotonic() - if current_time - last_lock_time >= ( - CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4 - ): - lock.reacquire() - last_lock_time = current_time - - num_docs += 1 - - # check if we should skip the document (typically because it's already syncing) - if doc_id in self.skip_docs: - continue - - # celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac" - # the key for the result is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac" - # we prefix the task id so it's easier to keep track of who created the task - # aka "documentset_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac" - custom_task_id = f"{self.task_id_prefix}_{uuid4()}" - - # add to the tracking taskset in redis BEFORE creating the celery task. - # note that for the moment we are using a single taskset key, not differentiated by cc_pair id - redis_client.sadd( - RedisConnectorCredentialPair.get_taskset_key(), custom_task_id - ) - - # Priority on sync's triggered by new indexing should be medium - celery_app.send_task( - OnyxCeleryTask.VESPA_METADATA_SYNC_TASK, - kwargs=dict(document_id=doc_id, tenant_id=tenant_id), - queue=OnyxCeleryQueues.VESPA_METADATA_SYNC, - task_id=custom_task_id, - priority=OnyxCeleryPriority.MEDIUM, - ignore_result=True, - ) - - num_tasks_sent += 1 - self.skip_docs.add(doc_id) - - if num_tasks_sent >= max_tasks: - break - - return num_tasks_sent, num_docs - - -class RedisGlobalConnectorCredentialPair: - """This class is used to scan documents by cc_pair in the db and collect them into - a unified set for syncing. 
- - It differs from the other redis helpers in that the taskset used spans - all connectors and is not per connector.""" - - PREFIX = "connectorsync" - FENCE_KEY = PREFIX + "_fence" - TASKSET_KEY = PREFIX + "_taskset" - - def __init__(self, redis: redis.Redis) -> None: - self.redis = redis - - @property - def fenced(self) -> bool: - if self.redis.exists(self.fence_key): - return True - - return False - - @property - def payload(self) -> int | None: - bytes = self.redis.get(self.fence_key) - if bytes is None: - return None - - progress = int(cast(int, bytes)) - return progress - - def get_remaining(self) -> int: - remaining = cast(int, self.redis.scard(self.taskset_key)) - return remaining - - @property - def fence_key(self) -> str: - """Notice that this is intentionally reusing the same fence for all - connector syncs""" - # example: connectorsync_fence - return f"{self.FENCE_KEY}" - - @property - def taskset_key(self) -> str: - """Notice that this is intentionally reusing the same taskset for all - connector syncs""" - # example: connectorsync_taskset - return f"{self.TASKSET_KEY}" - - def set_fence(self, payload: int | None) -> None: - if payload is None: - self.redis.srem(OnyxRedisConstants.ACTIVE_FENCES, self.fence_key) - self.redis.delete(self.fence_key) - return - - self.redis.set(self.fence_key, payload) - self.redis.sadd(OnyxRedisConstants.ACTIVE_FENCES, self.fence_key) - - def delete_taskset(self) -> None: - self.redis.delete(self.taskset_key) - - def reset(self) -> None: - self.redis.srem(OnyxRedisConstants.ACTIVE_FENCES, self.fence_key) - self.redis.delete(self.taskset_key) - self.redis.delete(self.fence_key) - - @staticmethod - def reset_all(r: redis.Redis) -> None: - r.srem( - OnyxRedisConstants.ACTIVE_FENCES, - RedisGlobalConnectorCredentialPair.FENCE_KEY, - ) - r.delete(RedisGlobalConnectorCredentialPair.TASKSET_KEY) - r.delete(RedisGlobalConnectorCredentialPair.FENCE_KEY) diff --git a/backend/onyx/redis/redis_utils.py b/backend/onyx/redis/redis_utils.py index d311ca84eea..1403238513a 100644 --- a/backend/onyx/redis/redis_utils.py +++ b/backend/onyx/redis/redis_utils.py @@ -1,6 +1,3 @@ -from onyx.redis.redis_connector_credential_pair import ( - RedisGlobalConnectorCredentialPair, -) from onyx.redis.redis_connector_delete import RedisConnectorDelete from onyx.redis.redis_connector_doc_perm_sync import RedisConnectorPermissionSync from onyx.redis.redis_connector_index import RedisConnectorIndex @@ -11,8 +8,6 @@ def is_fence(key_bytes: bytes) -> bool: key_str = key_bytes.decode("utf-8") - if key_str == RedisGlobalConnectorCredentialPair.FENCE_KEY: - return True if key_str.startswith(RedisDocumentSet.FENCE_PREFIX): return True if key_str.startswith(RedisUserGroup.FENCE_PREFIX): From 7dbe4ed50a59f6f7bde680a108a1c0945fb64b3b Mon Sep 17 00:00:00 2001 From: Chris Weaver Date: Fri, 18 Jul 2025 14:16:10 -0700 Subject: [PATCH 10/78] fix: improve assistant fetching efficiency (#5047) * Improve assistant fetching efficiency * More fix * Fix weird build stuff * Improve --- .../usage/PersonaMessagesChart.tsx | 13 ++-- .../app/ee/admin/performance/usage/page.tsx | 7 ++- web/src/app/layout.tsx | 9 +-- web/src/components/context/AppProvider.tsx | 10 +--- .../components/context/AssistantsContext.tsx | 59 +------------------ web/src/lib/chat/fetchAssistantdata.ts | 55 ++--------------- 6 files changed, 24 insertions(+), 129 deletions(-) diff --git a/web/src/app/ee/admin/performance/usage/PersonaMessagesChart.tsx b/web/src/app/ee/admin/performance/usage/PersonaMessagesChart.tsx index 
f084657e7a7..a90fa8f6985 100644 --- a/web/src/app/ee/admin/performance/usage/PersonaMessagesChart.tsx +++ b/web/src/app/ee/admin/performance/usage/PersonaMessagesChart.tsx @@ -5,7 +5,6 @@ import { usePersonaMessages, usePersonaUniqueUsers, } from "../lib"; -import { useAssistants } from "@/components/context/AssistantsContext"; import { DateRangePickerValue } from "@/components/dateRangeSelectors/AdminDateRangeSelector"; import Text from "@/components/ui/text"; import Title from "@/components/ui/title"; @@ -19,10 +18,13 @@ import { SelectValue, } from "@/components/ui/select"; import { useState, useMemo, useEffect } from "react"; +import { Persona } from "@/app/admin/assistants/interfaces"; export function PersonaMessagesChart({ + availablePersonas, timeRange, }: { + availablePersonas: Persona[]; timeRange: DateRangePickerValue; }) { const [selectedPersonaId, setSelectedPersonaId] = useState< @@ -30,7 +32,6 @@ export function PersonaMessagesChart({ >(undefined); const [searchQuery, setSearchQuery] = useState(""); const [highlightedIndex, setHighlightedIndex] = useState(-1); - const { allAssistants: personaList } = useAssistants(); const { data: personaMessagesData, @@ -48,11 +49,11 @@ export function PersonaMessagesChart({ const hasError = personaMessagesError || personaUniqueUsersError; const filteredPersonaList = useMemo(() => { - if (!personaList) return []; - return personaList.filter((persona) => + if (!availablePersonas) return []; + return availablePersonas.filter((persona) => persona.name.toLowerCase().includes(searchQuery.toLowerCase()) ); - }, [personaList, searchQuery]); + }, [availablePersonas, searchQuery]); const handleKeyDown = (e: React.KeyboardEvent) => { e.stopPropagation(); @@ -142,7 +143,7 @@ export function PersonaMessagesChart({
); - } else if (!personaList || hasError) { + } else if (!availablePersonas || hasError) { content = (

Failed to fetch data...

diff --git a/web/src/app/ee/admin/performance/usage/page.tsx b/web/src/app/ee/admin/performance/usage/page.tsx index ec5604298a2..937983a5a4d 100644 --- a/web/src/app/ee/admin/performance/usage/page.tsx +++ b/web/src/app/ee/admin/performance/usage/page.tsx @@ -10,9 +10,11 @@ import { AdminPageTitle } from "@/components/admin/Title"; import { FiActivity } from "react-icons/fi"; import UsageReports from "./UsageReports"; import { Separator } from "@/components/ui/separator"; +import { useAdminPersonas } from "@/app/admin/assistants/hooks"; export default function AnalyticsPage() { const [timeRange, setTimeRange] = useTimeRange(); + const { personas } = useAdminPersonas(); return (
@@ -27,7 +29,10 @@ export default function AnalyticsPage() { - +
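For reference, the analytics page now hands the admin personas to the chart through the new `availablePersonas` prop instead of the chart reading `allAssistants` from `AssistantsContext`. A minimal sketch of that usage, assuming a hypothetical wrapper component and a relative import path for the chart (the hook and prop names come from the surrounding diffs):

```
// Sketch only: `PersonaUsageSection` and the "./PersonaMessagesChart" import
// path are assumptions; `useAdminPersonas`, `availablePersonas`, and
// `timeRange` are taken from the diffs above.
import { useAdminPersonas } from "@/app/admin/assistants/hooks";
import { DateRangePickerValue } from "@/components/dateRangeSelectors/AdminDateRangeSelector";
import { PersonaMessagesChart } from "./PersonaMessagesChart";

export function PersonaUsageSection({
  timeRange,
}: {
  timeRange: DateRangePickerValue;
}) {
  // The page fetches admin personas once and passes them down, which keeps the
  // admin-only persona fetch out of the shared AssistantsContext.
  const { personas } = useAdminPersonas();
  return (
    <PersonaMessagesChart
      availablePersonas={personas}
      timeRange={timeRange}
    />
  );
}
```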
diff --git a/web/src/app/layout.tsx b/web/src/app/layout.tsx index 707367a459d..d68a3b756c6 100644 --- a/web/src/app/layout.tsx +++ b/web/src/app/layout.tsx @@ -17,7 +17,6 @@ import { EnterpriseSettings, ApplicationStatus, } from "./admin/settings/interfaces"; -import { fetchAssistantData } from "@/lib/chat/fetchAssistantdata"; import { AppProvider } from "@/components/context/AppProvider"; import { PHProvider } from "./providers"; import { getAuthTypeMetadataSS, getCurrentUserSS } from "@/lib/userSS"; @@ -31,6 +30,7 @@ import { DocumentsProvider } from "./chat/my-documents/DocumentsContext"; import CloudError from "@/components/errorPages/CloudErrorPage"; import Error from "@/components/errorPages/ErrorPage"; import AccessRestrictedPage from "@/components/errorPages/AccessRestrictedPage"; +import { fetchAssistantData } from "@/lib/chat/fetchAssistantdata"; const inter = Inter({ subsets: ["latin"], @@ -71,7 +71,7 @@ export default async function RootLayout({ }: { children: React.ReactNode; }) { - const [combinedSettings, assistantsData, user, authTypeMetadata] = + const [combinedSettings, assistants, user, authTypeMetadata] = await Promise.all([ fetchSettingsSS(), fetchAssistantData(), @@ -145,17 +145,12 @@ export default async function RootLayout({ ); } - const { assistants, hasAnyConnectors, hasImageCompatibleModel } = - assistantsData; - return getPageContent( diff --git a/web/src/components/context/AppProvider.tsx b/web/src/components/context/AppProvider.tsx index 8e693336a9d..53b9c957097 100644 --- a/web/src/components/context/AppProvider.tsx +++ b/web/src/components/context/AppProvider.tsx @@ -14,8 +14,6 @@ interface AppProviderProps { user: User | null; settings: CombinedSettings; assistants: MinimalPersonaSnapshot[]; - hasAnyConnectors: boolean; - hasImageCompatibleModel: boolean; authTypeMetadata: AuthTypeMetadata; } @@ -24,8 +22,6 @@ export const AppProvider = ({ user, settings, assistants, - hasAnyConnectors, - hasImageCompatibleModel, authTypeMetadata, }: AppProviderProps) => { return ( @@ -36,11 +32,7 @@ export const AppProvider = ({ authTypeMetadata={authTypeMetadata} > - + {children} diff --git a/web/src/components/context/AssistantsContext.tsx b/web/src/components/context/AssistantsContext.tsx index 950f6e7ba75..7bb48c73194 100644 --- a/web/src/components/context/AssistantsContext.tsx +++ b/web/src/components/context/AssistantsContext.tsx @@ -25,9 +25,6 @@ interface AssistantsContextProps { ownedButHiddenAssistants: MinimalPersonaSnapshot[]; refreshAssistants: () => Promise; isImageGenerationAvailable: boolean; - // Admin only - editablePersonas: MinimalPersonaSnapshot[]; - allAssistants: MinimalPersonaSnapshot[]; pinnedAssistants: MinimalPersonaSnapshot[]; setPinnedAssistants: Dispatch>; } @@ -41,22 +38,11 @@ export const AssistantsProvider: React.FC<{ initialAssistants: MinimalPersonaSnapshot[]; hasAnyConnectors?: boolean; hasImageCompatibleModel?: boolean; -}> = ({ - children, - initialAssistants, - hasAnyConnectors, - hasImageCompatibleModel, -}) => { +}> = ({ children, initialAssistants }) => { const [assistants, setAssistants] = useState( initialAssistants || [] ); - const { user, isAdmin, isCurator } = useUser(); - const [editablePersonas, setEditablePersonas] = useState< - MinimalPersonaSnapshot[] - >([]); - const [allAssistants, setAllAssistants] = useState( - [] - ); + const { user } = useUser(); const [pinnedAssistants, setPinnedAssistants] = useState< MinimalPersonaSnapshot[] @@ -107,37 +93,6 @@ export const AssistantsProvider: React.FC<{ 
checkImageGenerationAvailability(); }, []); - const fetchPersonas = async () => { - if (!isAdmin && !isCurator) { - return; - } - - try { - const [editableResponse, allResponse] = await Promise.all([ - fetch("/api/admin/persona?get_editable=true"), - fetch("/api/admin/persona"), - ]); - - if (editableResponse.ok) { - const editablePersonas = await editableResponse.json(); - setEditablePersonas(editablePersonas); - } - - if (allResponse.ok) { - const allPersonas = await allResponse.json(); - setAllAssistants(allPersonas); - } else { - console.error("Error fetching personas:", allResponse); - } - } catch (error) { - console.error("Error fetching personas:", error); - } - }; - - useEffect(() => { - fetchPersonas(); - }, [isAdmin, isCurator]); - const refreshAssistants = async () => { try { const response = await fetch("/api/persona", { @@ -148,13 +103,7 @@ export const AssistantsProvider: React.FC<{ }); if (!response.ok) throw new Error("Failed to fetch assistants"); let assistants: MinimalPersonaSnapshot[] = await response.json(); - - let filteredAssistants = filterAssistants(assistants); - - setAssistants(filteredAssistants); - - // Fetch and update allAssistants for admins and curators - await fetchPersonas(); + setAssistants(filterAssistants(assistants)); } catch (error) { console.error("Error refreshing assistants:", error); } @@ -197,8 +146,6 @@ export const AssistantsProvider: React.FC<{ finalAssistants, ownedButHiddenAssistants, refreshAssistants, - editablePersonas, - allAssistants, isImageGenerationAvailable, setPinnedAssistants, pinnedAssistants, diff --git a/web/src/lib/chat/fetchAssistantdata.ts b/web/src/lib/chat/fetchAssistantdata.ts index f17b70de593..7c76fa932f5 100644 --- a/web/src/lib/chat/fetchAssistantdata.ts +++ b/web/src/lib/chat/fetchAssistantdata.ts @@ -1,65 +1,20 @@ -import { fetchSS } from "@/lib/utilsSS"; import { MinimalPersonaSnapshot } from "@/app/admin/assistants/interfaces"; -import { fetchLLMProvidersSS } from "@/lib/llm/fetchLLMs"; import { fetchAssistantsSS } from "../assistants/fetchAssistantsSS"; -import { modelSupportsImageInput } from "../llm/utils"; import { filterAssistants } from "../assistants/utils"; -interface AssistantData { - assistants: MinimalPersonaSnapshot[]; - hasAnyConnectors: boolean; - hasImageCompatibleModel: boolean; -} -export async function fetchAssistantData(): Promise { - // Default state if anything fails - const defaultState: AssistantData = { - assistants: [], - hasAnyConnectors: false, - hasImageCompatibleModel: false, - }; - +export async function fetchAssistantData(): Promise { try { - // Fetch core assistants data first + // Fetch core assistants data const [assistants, assistantsFetchError] = await fetchAssistantsSS(); if (assistantsFetchError) { // This is not a critical error and occurs when the user is not logged in console.warn(`Failed to fetch assistants - ${assistantsFetchError}`); - return defaultState; + return []; } - // Parallel fetch of additional data - const [ccPairsResponse, llmProviders] = await Promise.all([ - fetchSS("/manage/connector-status").catch((error) => { - console.error("Failed to fetch connectors:", error); - return null; - }), - fetchLLMProvidersSS().catch((error) => { - console.error("Failed to fetch LLM providers:", error); - return []; - }), - ]); - - const hasAnyConnectors = ccPairsResponse?.ok - ? 
(await ccPairsResponse.json()).length > 0 - : false; - - const hasImageCompatibleModel = llmProviders.some( - (provider) => - provider.provider === "openai" || - provider.model_configurations.some((modelConfiguration) => - modelSupportsImageInput(llmProviders, modelConfiguration.name) - ) - ); - - let filteredAssistants = filterAssistants(assistants); - - return { - assistants: filteredAssistants, - hasAnyConnectors, - hasImageCompatibleModel, - }; + return filterAssistants(assistants); } catch (error) { console.error("Unexpected error in fetchAssistantData:", error); - return defaultState; + return []; } } From d1d8626b405d49527a0beab6919344c8de852e36 Mon Sep 17 00:00:00 2001 From: joachim-danswer Date: Fri, 18 Jul 2025 16:15:11 -0700 Subject: [PATCH 11/78] feat: KG improvements (#5048) * improvements * drop views if SQL fails * mypy fix --- .../kb_search/nodes/a3_generate_simple_sql.py | 48 +++++++++++++++++-- backend/onyx/db/kg_temp_view.py | 9 ++-- 2 files changed, 49 insertions(+), 8 deletions(-) diff --git a/backend/onyx/agents/agent_search/kb_search/nodes/a3_generate_simple_sql.py b/backend/onyx/agents/agent_search/kb_search/nodes/a3_generate_simple_sql.py index 53f823017bc..181a15fcee2 100644 --- a/backend/onyx/agents/agent_search/kb_search/nodes/a3_generate_simple_sql.py +++ b/backend/onyx/agents/agent_search/kb_search/nodes/a3_generate_simple_sql.py @@ -203,6 +203,8 @@ def generate_simple_sql( if state.kg_entity_temp_view_name is None: raise ValueError("kg_entity_temp_view_name is not set") + sql_statement_display: str | None = None + ## STEP 3 - articulate goals stream_write_step_activities(writer, _KG_STEP_NR) @@ -381,7 +383,18 @@ def generate_simple_sql( raise e - logger.debug(f"A3 - sql_statement after correction: {sql_statement}") + # display sql statement with view names replaced by general view names + sql_statement_display = sql_statement.replace( + state.kg_doc_temp_view_name, "" + ) + sql_statement_display = sql_statement_display.replace( + state.kg_rel_temp_view_name, "" + ) + sql_statement_display = sql_statement_display.replace( + state.kg_entity_temp_view_name, "" + ) + + logger.debug(f"A3 - sql_statement after correction: {sql_statement_display}") # Get SQL for source documents @@ -409,7 +422,20 @@ def generate_simple_sql( "relationship_table", rel_temp_view ) - logger.debug(f"A3 source_documents_sql: {source_documents_sql}") + if source_documents_sql: + source_documents_sql_display = source_documents_sql.replace( + state.kg_doc_temp_view_name, "" + ) + source_documents_sql_display = source_documents_sql_display.replace( + state.kg_rel_temp_view_name, "" + ) + source_documents_sql_display = source_documents_sql_display.replace( + state.kg_entity_temp_view_name, "" + ) + else: + source_documents_sql_display = "(No source documents SQL generated)" + + logger.debug(f"A3 source_documents_sql: {source_documents_sql_display}") scalar_result = None query_results = None @@ -435,7 +461,13 @@ def generate_simple_sql( rows = result.fetchall() query_results = [dict(row._mapping) for row in rows] except Exception as e: + # TODO: raise error on frontend logger.error(f"Error executing SQL query: {e}") + drop_views( + allowed_docs_view_name=doc_temp_view, + kg_relationships_view_name=rel_temp_view, + kg_entity_view_name=ent_temp_view, + ) raise e @@ -459,8 +491,14 @@ def generate_simple_sql( for source_document_result in query_source_document_results ] except Exception as e: - # No stopping here, the individualized SQL query is not mandatory # TODO: raise error on frontend + + 
drop_views( + allowed_docs_view_name=doc_temp_view, + kg_relationships_view_name=rel_temp_view, + kg_entity_view_name=ent_temp_view, + ) + logger.error(f"Error executing Individualized SQL query: {e}") else: @@ -493,11 +531,11 @@ def generate_simple_sql( if reasoning: stream_write_step_answer_explicit(writer, step_nr=_KG_STEP_NR, answer=reasoning) - if main_sql_statement: + if sql_statement_display: stream_write_step_answer_explicit( writer, step_nr=_KG_STEP_NR, - answer=f" \n Generated SQL: {main_sql_statement}", + answer=f" \n Generated SQL: {sql_statement_display}", ) stream_close_step_answer(writer, _KG_STEP_NR) diff --git a/backend/onyx/db/kg_temp_view.py b/backend/onyx/db/kg_temp_view.py index 5a956ea53e2..baa4b012141 100644 --- a/backend/onyx/db/kg_temp_view.py +++ b/backend/onyx/db/kg_temp_view.py @@ -1,3 +1,5 @@ +import random + from sqlalchemy import text from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import Session @@ -17,10 +19,11 @@ def get_user_view_names(user_email: str, tenant_id: str) -> KGViewNames: user_email_cleaned = ( user_email.replace("@", "__").replace(".", "_").replace("+", "_") ) + random_suffix_str = str(random.randint(1000000, 9999999)) return KGViewNames( - allowed_docs_view_name=f'"{tenant_id}".{KG_TEMP_ALLOWED_DOCS_VIEW_NAME_PREFIX}_{user_email_cleaned}', - kg_relationships_view_name=f'"{tenant_id}".{KG_TEMP_KG_RELATIONSHIPS_VIEW_NAME_PREFIX}_{user_email_cleaned}', - kg_entity_view_name=f'"{tenant_id}".{KG_TEMP_KG_ENTITIES_VIEW_NAME_PREFIX}_{user_email_cleaned}', + allowed_docs_view_name=f'"{tenant_id}".{KG_TEMP_ALLOWED_DOCS_VIEW_NAME_PREFIX}_{user_email_cleaned}_{random_suffix_str}', + kg_relationships_view_name=f'"{tenant_id}".{KG_TEMP_KG_RELATIONSHIPS_VIEW_NAME_PREFIX}_{user_email_cleaned}_{random_suffix_str}', + kg_entity_view_name=f'"{tenant_id}".{KG_TEMP_KG_ENTITIES_VIEW_NAME_PREFIX}_{user_email_cleaned}_{random_suffix_str}', ) From 2b856d40d4574d38cff079b95fffb5039c561cc6 Mon Sep 17 00:00:00 2001 From: Rei Meguro <36625832+Orbital-Web@users.noreply.github.com> Date: Sat, 19 Jul 2025 10:51:51 +0900 Subject: [PATCH 12/78] feat: Search and Answer Quality Test Script (#4974) * aefads * search quality tests improvement Co-authored-by: wenxi-onyx * nits * refactor: config refactor * document context + skip genai fix * feat: answer eval * more error messages * mypy ragas * mypy * small fixes * feat: more metrics * fix * feat: grab content * typing * feat: lazy updates * mypy * all at front * feat: answer correctness * use api key so it works with auth enabled * update readme * feat: auto add path * feat: rate limit * fix: readme + remove rerank all * fix: raise exception immediately * docs: improved clarity * feat: federated handling * fix: mypy * nits --------- Co-authored-by: wenxi-onyx --- backend/onyx/chat/process_message.py | 1 + .../tests/regression/search_quality/README.md | 60 +- .../tests/regression/search_quality/models.py | 82 ++ .../search_quality/run_search_eval.py | 822 +++++++++++++++--- .../search_eval_config.yaml.template | 16 - .../search_quality/test_queries.json.template | 10 +- .../regression/search_quality/util_config.py | 75 -- .../regression/search_quality/util_data.py | 166 ---- .../regression/search_quality/util_eval.py | 94 -- .../search_quality/util_retrieve.py | 88 -- .../tests/regression/search_quality/utils.py | 208 +++++ 11 files changed, 1023 insertions(+), 599 deletions(-) create mode 100644 backend/tests/regression/search_quality/models.py delete mode 100644 
backend/tests/regression/search_quality/search_eval_config.yaml.template delete mode 100644 backend/tests/regression/search_quality/util_config.py delete mode 100644 backend/tests/regression/search_quality/util_data.py delete mode 100644 backend/tests/regression/search_quality/util_eval.py delete mode 100644 backend/tests/regression/search_quality/util_retrieve.py create mode 100644 backend/tests/regression/search_quality/utils.py diff --git a/backend/onyx/chat/process_message.py b/backend/onyx/chat/process_message.py index 1a7f64b5003..dbb04d3a963 100644 --- a/backend/onyx/chat/process_message.py +++ b/backend/onyx/chat/process_message.py @@ -1012,6 +1012,7 @@ def create_response( tools=tools, db_session=db_session, use_agentic_search=new_msg_req.use_agentic_search, + skip_gen_ai_answer_generation=new_msg_req.skip_gen_ai_answer_generation, ) info_by_subq: dict[SubQuestionKey, AnswerPostInfo] = defaultdict( diff --git a/backend/tests/regression/search_quality/README.md b/backend/tests/regression/search_quality/README.md index db35cf972fd..c4eb87d5ac0 100644 --- a/backend/tests/regression/search_quality/README.md +++ b/backend/tests/regression/search_quality/README.md @@ -1,62 +1,50 @@ # Search Quality Test Script -This Python script evaluates the search results for a list of queries. - -This script will likely get refactored in the future as an API endpoint. -In the meanwhile, it is used to evaluate the search quality using locally ingested documents. -The key differentiating factor with `answer_quality` is that it can evaluate results without explicit "ground truth" using the reranker as a reference. +This Python script evaluates the search and answer quality for a list of queries, against a ground truth. It will use the currently ingested documents for the search, answer generation, and ground truth comparisons. ## Usage 1. Ensure you have the required dependencies installed and onyx running. -2. Ensure a reranker model is configured in the search settings. -This can be checked/modified by opening the admin panel, going to search settings, and ensuring a reranking model is set. - -3. Set up the PYTHONPATH permanently: - Add the following line to your shell configuration file (e.g., `~/.bashrc`, `~/.zshrc`, or `~/.bash_profile`): - ``` - export PYTHONPATH=$PYTHONPATH:/path/to/onyx/backend - ``` - Replace `/path/to/onyx` with the actual path to your Onyx repository. - After adding this line, restart your terminal or run `source ~/.bashrc` (or the appropriate config file) to apply the changes. +2. Ensure you have `OPENAI_API_KEY` set if you intend to do answer evaluation (enabled by default, unless you run the script with the `-s` flag). Also, if you're not using `AUTH_TYPE=disabled`, go to the API Keys page in the admin panel, generate a basic api token, and add it to the env file as `ONYX_API_KEY=on_...`. -4. Navigate to Onyx repo, search_quality folder: +3. Navigate to Onyx repo, **search_quality** folder: ``` cd path/to/onyx/backend/tests/regression/search_quality ``` -5. Copy `test_queries.json.template` to `test_queries.json` and add/remove test queries in it. The possible fields are: +4. Copy `test_queries.json.template` to `test_queries.json` and add/remove test queries in it. 
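For illustration, a single entry in `test_queries.json` might look like the sketch below; the values are hypothetical, `test_queries.json.template` remains the authoritative format, and each `doc_link` must resolve to a document that is already indexed:

```
[
  {
    "question": "How do I rotate an API key?",
    "ground_truth": [
      { "doc_source": "web", "doc_link": "https://docs.example.com/admin/api-keys" }
    ],
    "ground_truth_response": "Keys are rotated from the admin panel; the old key is revoked immediately.",
    "categories": ["admin"]
  }
]
```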
The fields for each query are: - `question: str` the query - - `question_search: Optional[str]` modified query specifically for the search step - - `ground_truth: Optional[list[GroundTruth]]` a ranked list of expected search results with fields: - - `doc_source: str` document source (e.g., Web, Drive, Linear), currently unused + - `ground_truth: list[GroundTruth]` an un-ranked list of expected search results with fields: + - `doc_source: str` document source (e.g., web, google_drive, linear), used to normalize the links in some cases - `doc_link: str` link associated with document, used to find corresponding document in local index + - `ground_truth_response: Optional[str]` a response with clauses the ideal answer should include - `categories: Optional[list[str]]` list of categories, used to aggregate evaluation results -6. Copy `search_eval_config.yaml.template` to `search_eval_config.yaml` and specify the search and eval parameters - -7. Run `run_search_eval.py` to run the search and evaluate the search results +5. Run `run_search_eval.py` to evaluate the queries. All parameters are optional and have sensible defaults: ``` python run_search_eval.py + -d --dataset # Path to the test-set JSON file (default: ./test_queries.json) + -n --num_search # Maximum number of documents to retrieve per search (default: 50) + -a --num_answer # Maximum number of documents to use for answer evaluation (default: 25) + -w --max_workers # Maximum number of concurrent search requests (0 = unlimited, default: 10). + -r --max_req_rate # Maximum number of search requests per minute (0 = unlimited, default: 0). + -q --timeout # Request timeout in seconds (default: 120) + -e --api_endpoint # Base URL of the Onyx API server (default: http://127.0.0.1:8080) + -s --search_only # Only perform search and not answer evaluation (default: false) + -t --tenant_id # Tenant ID to use for the evaluation (default: None) ``` -8. Optionally, save the generated `test_queries.json` in the export folder to reuse the generated `question_search`, and rerun the search evaluation with alternative search parameters. - -## Metrics -There are two main metrics currently implemented: -- ratio_topk: the ratio of documents in the comparison set that are in the topk search results (higher is better, 0-1) -- avg_rank_delta: the average rank difference between the comparison set and search results (lower is better, 0-inf) - -Ratio topk gives a general idea on whether the most relevant documents are appearing first in the search results. Decreasing `eval_topk` will make this metric stricter, requiring relevant documents to appear in a narrow window. - -Avg rank delta is another metric which can give insight on the performance of documents not in the topk search results. If none of the comparison documents are in the topk, `ratio_topk` will only show a 0, whereas `avg_rank_delta` will show a higher value the worse the search results gets. +Note: If you only care about search quality, you should run with the `-s` flag for a significantly faster evaluation. Furthermore, you should set `-r` to 1 if running with federated search enabled to avoid hitting rate limits. -Furthermore, there are two versions of the metrics: ground truth, and soft truth. +6. After the run, an `eval-YYYY-MM-DD-HH-MM-SS` folder is created containing: -The ground truth includes documents explicitly listed as relevant in the test dataset. The ground truth metrics will only be computed if a ground truth set is provided for the question and exists in the index. 
+ * `test_queries.json` – the dataset used with the list of valid queries and corresponding indexed ground truth. + * `search_results.json` – per-query search and answer details. + * `results_by_category.csv` – aggregated metrics per category and for "all". + * `search_position_chart.png` – bar-chart of ground-truth ranks. -The soft truth is built on top of the ground truth (if provided), filling the remaining entries with results from the reranker. The soft truth metrics will only be computed if `skip_rerank` is false. Computing the soft truth metric can be extremely slow, especially for large `num_returned_hits`. However, it can provide a good basis when there are many relevant documents in no particular order, or for running quick tests without explicitly having to mention which documents are relevant. \ No newline at end of file +You can replace `test_queries.json` with the generated one for a slightly faster loading of the queries the next time around. \ No newline at end of file diff --git a/backend/tests/regression/search_quality/models.py b/backend/tests/regression/search_quality/models.py new file mode 100644 index 00000000000..b8c00e003fa --- /dev/null +++ b/backend/tests/regression/search_quality/models.py @@ -0,0 +1,82 @@ +from pydantic import BaseModel + +from onyx.configs.constants import DocumentSource +from onyx.context.search.models import SavedSearchDoc + + +class GroundTruth(BaseModel): + doc_source: DocumentSource + doc_link: str + + +class TestQuery(BaseModel): + question: str + ground_truth: list[GroundTruth] = [] + ground_truth_response: str | None = None + categories: list[str] = [] + + # autogenerated + ground_truth_docids: list[str] = [] + + +class EvalConfig(BaseModel): + max_search_results: int + max_answer_context: int + num_workers: int # 0 = unlimited + max_request_rate: int # 0 = unlimited + request_timeout: int + api_url: str + search_only: bool + + +class OneshotQAResult(BaseModel): + time_taken: float + top_documents: list[SavedSearchDoc] + answer: str | None + + +class RetrievedDocument(BaseModel): + document_id: str + chunk_id: int + content: str + + +class AnalysisSummary(BaseModel): + question: str + categories: list[str] + found: bool + rank: int | None + total_results: int + ground_truth_count: int + response_relevancy: float | None = None + faithfulness: float | None = None + factual_correctness: float | None = None + answer: str | None = None + retrieved: list[RetrievedDocument] = [] + time_taken: float + + +class SearchMetrics(BaseModel): + total_queries: int + found_count: int + + # for found results + best_rank: int + worst_rank: int + average_rank: float + top_k_accuracy: dict[int, float] + + +class AnswerMetrics(BaseModel): + response_relevancy: float + faithfulness: float + factual_correctness: float + + # only for metric computation + n_response_relevancy: int + n_faithfulness: int + n_factual_correctness: int + + +class CombinedMetrics(SearchMetrics, AnswerMetrics): + average_time_taken: float diff --git a/backend/tests/regression/search_quality/run_search_eval.py b/backend/tests/regression/search_quality/run_search_eval.py index 1c3d03744d5..43dcd55474b 100644 --- a/backend/tests/regression/search_quality/run_search_eval.py +++ b/backend/tests/regression/search_quality/run_search_eval.py @@ -1,151 +1,725 @@ import csv +import json +import os +import sys +import time from collections import defaultdict +from concurrent.futures import as_completed +from concurrent.futures import ThreadPoolExecutor +from datetime import datetime from pathlib 
import Path +from threading import Event +from threading import Lock +from threading import Semaphore +from typing import cast +import matplotlib.pyplot as plt # type: ignore +import requests +from dotenv import load_dotenv +from matplotlib.patches import Patch # type: ignore +from pydantic import ValidationError +from requests.exceptions import RequestException +from retry import retry + +# add onyx/backend to path (since this isn't done automatically when running as a script) +current_dir = Path(__file__).parent +onyx_dir = current_dir.parent.parent.parent.parent +sys.path.append(str(onyx_dir / "backend")) + +# load env before app_config loads (since env doesn't get loaded when running as a script) +env_path = onyx_dir / ".vscode" / ".env" +if not env_path.exists(): + raise RuntimeError( + "Could not find .env file. Please create one in the root .vscode directory." + ) +load_dotenv(env_path) + +# pylint: disable=E402 +# flake8: noqa: E402 + +from ee.onyx.server.query_and_chat.models import OneShotQARequest +from ee.onyx.server.query_and_chat.models import OneShotQAResponse +from onyx.chat.models import ThreadMessage from onyx.configs.app_configs import POSTGRES_API_SERVER_POOL_OVERFLOW from onyx.configs.app_configs import POSTGRES_API_SERVER_POOL_SIZE -from onyx.context.search.models import RerankingDetails -from onyx.db.engine.sql_engine import get_session_with_current_tenant +from onyx.configs.app_configs import AUTH_TYPE +from onyx.configs.constants import AuthType +from onyx.configs.constants import MessageType +from onyx.context.search.enums import OptionalSearchSetting +from onyx.context.search.models import IndexFilters +from onyx.context.search.models import RetrievalDetails +from onyx.db.engine.sql_engine import get_session_with_tenant from onyx.db.engine.sql_engine import SqlEngine -from onyx.db.search_settings import get_current_search_settings -from onyx.db.search_settings import get_multilingual_expansion -from onyx.document_index.factory import get_default_document_index from onyx.utils.logger import setup_logger from shared_configs.configs import MULTI_TENANT -from tests.regression.search_quality.util_config import load_config -from tests.regression.search_quality.util_data import export_test_queries -from tests.regression.search_quality.util_data import load_test_queries -from tests.regression.search_quality.util_eval import evaluate_one_query -from tests.regression.search_quality.util_eval import get_corresponding_document -from tests.regression.search_quality.util_eval import metric_names -from tests.regression.search_quality.util_retrieve import rerank_one_query -from tests.regression.search_quality.util_retrieve import search_one_query +from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE +from tests.regression.search_quality.models import AnalysisSummary +from tests.regression.search_quality.models import CombinedMetrics +from tests.regression.search_quality.models import EvalConfig +from tests.regression.search_quality.models import OneshotQAResult +from tests.regression.search_quality.models import TestQuery +from tests.regression.search_quality.utils import compute_overall_scores +from tests.regression.search_quality.utils import find_document_id +from tests.regression.search_quality.utils import get_federated_sources +from tests.regression.search_quality.utils import LazyJsonWriter +from tests.regression.search_quality.utils import ragas_evaluate +from tests.regression.search_quality.utils import search_docs_to_doc_contexts logger = 
setup_logger(__name__) +GENERAL_HEADERS = {"Content-Type": "application/json"} +TOP_K_LIST = [1, 3, 5, 10] -def run_search_eval() -> None: - config = load_config() - test_queries = load_test_queries() - # export related - export_path = Path(config.export_folder) - export_test_queries(test_queries, export_path / "test_queries.json") - search_result_path = export_path / "search_results.csv" - eval_path = export_path / "eval_results.csv" - aggregate_eval_path = export_path / "aggregate_eval.csv" - aggregate_results: dict[str, list[list[float]]] = defaultdict( - lambda: [[] for _ in metric_names] - ) +class SearchAnswerAnalyzer: + def __init__( + self, + config: EvalConfig, + tenant_id: str | None = None, + ): + if not MULTI_TENANT: + logger.info("Running in single-tenant mode") + tenant_id = POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE + elif tenant_id is None: + raise ValueError("Tenant ID is required for multi-tenant") + + self.config = config + self.tenant_id = tenant_id - with get_session_with_current_tenant() as db_session: - multilingual_expansion = get_multilingual_expansion(db_session) - search_settings = get_current_search_settings(db_session) - document_index = get_default_document_index(search_settings, None) - rerank_settings = RerankingDetails.from_db_model(search_settings) - - if config.skip_rerank: - logger.warning("Reranking is disabled, evaluation will not run") - elif rerank_settings.rerank_model_name is None: - raise ValueError( - "Reranking is enabled but no reranker is configured. " - "Please set the reranker in the admin panel search settings." + # shared analysis results + self._lock = Lock() + self._progress_counter = 0 + self._result_writer: LazyJsonWriter | None = None + self.ranks: list[int | None] = [] + self.metrics: dict[str, CombinedMetrics] = defaultdict( + lambda: CombinedMetrics( + total_queries=0, + found_count=0, + best_rank=config.max_search_results, + worst_rank=1, + average_rank=0.0, + top_k_accuracy={k: 0.0 for k in TOP_K_LIST}, + response_relevancy=0.0, + faithfulness=0.0, + factual_correctness=0.0, + n_response_relevancy=0, + n_faithfulness=0, + n_factual_correctness=0, + average_time_taken=0.0, ) + ) + + def run_analysis(self, dataset_path: Path, export_path: Path) -> None: + # load and save the dataset + dataset = self._load_dataset(dataset_path) + dataset_size = len(dataset) + dataset_export_path = export_path / "test_queries.json" + with dataset_export_path.open("w") as f: + dataset_serializable = [q.model_dump(mode="json") for q in dataset] + json.dump(dataset_serializable, f, indent=4) - # run search and evaluate - logger.info( - "Running search and evaluation... 
" - f"Individual search and evaluation results will be saved to {search_result_path} and {eval_path}" + result_export_path = export_path / "search_results.json" + self._result_writer = LazyJsonWriter(result_export_path) + + # set up rate limiting and threading primitives + interval = ( + 60.0 / self.config.max_request_rate + if self.config.max_request_rate > 0 + else 0.0 ) - with ( - search_result_path.open("w") as search_file, - eval_path.open("w") as eval_file, - ): - search_csv_writer = csv.writer(search_file) - eval_csv_writer = csv.writer(eval_file) - search_csv_writer.writerow( - ["source", "query", "rank", "score", "doc_id", "chunk_id"] - ) - eval_csv_writer.writerow(["query", *metric_names]) - - for query in test_queries: - # search and write results - assert query.question_search is not None - search_chunks = search_one_query( - query.question_search, - multilingual_expansion, - document_index, - db_session, - config, - ) - for rank, result in enumerate(search_chunks): - search_csv_writer.writerow( + available_workers = Semaphore(self.config.num_workers) + stop_event = Event() + + def _submit_wrapper(tc: TestQuery) -> AnalysisSummary: + try: + return self._run_and_analyze_one(tc, dataset_size) + except Exception as e: + logger.error("Error during analysis: %s", e) + stop_event.set() + raise + finally: + available_workers.release() + + # run the analysis + logger.info("Starting analysis of %d queries", dataset_size) + logger.info("Using %d parallel workers", self.config.num_workers) + logger.info("Exporting search results to %s", result_export_path) + + with ThreadPoolExecutor( + max_workers=self.config.num_workers or None + ) as executor: + # submit requests at configured rate, break early if any error occurs + futures = [] + for tc in dataset: + if stop_event.is_set(): + break + + available_workers.acquire() + fut = executor.submit(_submit_wrapper, tc) + futures.append(fut) + + if ( + len(futures) != dataset_size + and interval > 0 + and not stop_event.is_set() + ): + time.sleep(interval) + + # ensure all tasks finish and surface any exceptions + for fut in as_completed(futures): + fut.result() + + if self._result_writer: + self._result_writer.close() + self._aggregate_metrics() + + def generate_detailed_report(self, export_path: Path) -> None: + logger.info("Generating detailed report...") + + csv_path = export_path / "results_by_category.csv" + with csv_path.open("w", newline="") as csv_file: + csv_writer = csv.writer(csv_file) + csv_writer.writerow( + [ + "category", + "total_queries", + "found", + "percent_found", + "best_rank", + "worst_rank", + "avg_rank", + *[f"top_{k}_accuracy" for k in TOP_K_LIST], + *( [ - "search", - query.question_search, - rank, - result.score, - result.document_id, - result.chunk_id, + "avg_response_relevancy", + "avg_faithfulness", + "avg_factual_correctness", ] - ) + if not self.config.search_only + else [] + ), + "search_score", + *(["answer_score"] if not self.config.search_only else []), + "avg_time_taken", + ] + ) + + for category, metrics in sorted( + self.metrics.items(), key=lambda c: (0 if c[0] == "all" else 1, c[0]) + ): + found_count = metrics.found_count + total_count = metrics.total_queries + accuracy = found_count / total_count * 100 if total_count > 0 else 0 - rerank_chunks = [] - if not config.skip_rerank: - # rerank and write results - rerank_chunks = rerank_one_query( - query.question, search_chunks, rerank_settings + print( + f"\n{category.upper()}:" + f" total queries: {total_count}\n" + f" found: {found_count} 
({accuracy:.1f}%)" + ) + best_rank = metrics.best_rank if metrics.found_count > 0 else None + worst_rank = metrics.worst_rank if metrics.found_count > 0 else None + avg_rank = metrics.average_rank if metrics.found_count > 0 else None + if metrics.found_count > 0: + print( + f" average rank (for found results): {avg_rank:.2f}\n" + f" best rank (for found results): {best_rank:.2f}\n" + f" worst rank (for found results): {worst_rank:.2f}" ) - for rank, result in enumerate(rerank_chunks): - search_csv_writer.writerow( - [ - "rerank", - query.question, - rank, - result.score, - result.document_id, - result.chunk_id, - ] + for k, acc in metrics.top_k_accuracy.items(): + print(f" top-{k} accuracy: {acc:.1f}%") + if not self.config.search_only: + if metrics.n_response_relevancy > 0: + print( + f" average response relevancy: {metrics.response_relevancy:.2f}" ) + if metrics.n_faithfulness > 0: + print(f" average faithfulness: {metrics.faithfulness:.2f}") + if metrics.n_factual_correctness > 0: + print( + f" average factual correctness: {metrics.factual_correctness:.2f}" + ) + search_score, answer_score = compute_overall_scores(metrics) + print(f" search score: {search_score:.1f}") + if not self.config.search_only: + print(f" answer score: {answer_score:.1f}") + print(f" average time taken: {metrics.average_time_taken:.2f}s") - # evaluate and write results - truth_documents = [ - doc - for truth in query.ground_truth - if (doc := get_corresponding_document(truth.doc_link, db_session)) - ] - metrics = evaluate_one_query( - search_chunks, rerank_chunks, truth_documents, config.eval_topk - ) - metric_vals = [getattr(metrics, field) for field in metric_names] - eval_csv_writer.writerow([query.question, *metric_vals]) - - # add to aggregation - for category in ["all"] + query.categories: - for i, val in enumerate(metric_vals): - if val is not None: - aggregate_results[category][i].append(val) - - # aggregate and write results - with aggregate_eval_path.open("w") as file: - aggregate_csv_writer = csv.writer(file) - aggregate_csv_writer.writerow(["category", *metric_names]) - - for category, agg_metrics in aggregate_results.items(): - aggregate_csv_writer.writerow( + csv_writer.writerow( [ category, + total_count, + found_count, + f"{accuracy:.1f}", + best_rank or "", + worst_rank or "", + f"{avg_rank:.2f}" if avg_rank is not None else "", + *[f"{acc:.1f}" for acc in metrics.top_k_accuracy.values()], + *( + [ + ( + f"{metrics.response_relevancy:.2f}" + if metrics.n_response_relevancy > 0 + else "" + ), + ( + f"{metrics.faithfulness:.2f}" + if metrics.n_faithfulness > 0 + else "" + ), + ( + f"{metrics.factual_correctness:.2f}" + if metrics.n_factual_correctness > 0 + else "" + ), + ] + if not self.config.search_only + else [] + ), + f"{search_score:.1f}", *( - sum(metric) / len(metric) if metric else None - for metric in agg_metrics + [f"{answer_score:.1f}"] + if not self.config.search_only + else [] ), + f"{metrics.average_time_taken:.2f}", ] ) + logger.info("Saved category breakdown csv to %s", csv_path) + + def generate_chart(self, export_path: Path) -> None: + logger.info("Generating search position chart...") + + if len(self.ranks) == 0: + logger.warning("No results to chart") + return + + found_count = 0 + not_found_count = 0 + rank_counts: dict[int, int] = defaultdict(int) + for rank in self.ranks: + if rank is None: + not_found_count += 1 + else: + found_count += 1 + rank_counts[rank] += 1 + + # create the data for plotting + if found_count: + max_rank = max(rank_counts.keys()) + positions = 
list(range(1, max_rank + 1)) + counts = [rank_counts.get(pos, 0) for pos in positions] + else: + positions = [] + counts = [] + + # add the "not found" bar on the far right + if not_found_count: + # add some spacing between found positions and "not found" + not_found_position = (max(positions) + 2) if positions else 1 + positions.append(not_found_position) + counts.append(not_found_count) + + # create labels for x-axis + x_labels = [str(pos) for pos in positions[:-1]] + [ + f"not found\n(>{self.config.max_search_results})" + ] + else: + x_labels = [str(pos) for pos in positions] + + # create the figure and bar chart + plt.figure(figsize=(14, 6)) + + # use different colors for found vs not found + colors = ( + ["#3498db"] * (len(positions) - 1) + ["#e74c3c"] + if not_found_count > 0 + else ["#3498db"] * len(positions) + ) + bars = plt.bar( + positions, counts, color=colors, alpha=0.7, edgecolor="black", linewidth=0.5 + ) + + # customize the chart + plt.xlabel("Position in Search Results", fontsize=12) + plt.ylabel("Number of Ground Truth Documents", fontsize=12) + plt.title( + "Ground Truth Document Positions in Search Results", + fontsize=14, + fontweight="bold", + ) + plt.grid(axis="y", alpha=0.3) + + # add value labels on top of each bar + for bar, count in zip(bars, counts): + if count > 0: + plt.text( + bar.get_x() + bar.get_width() / 2, + bar.get_height() + 0.1, + str(count), + ha="center", + va="bottom", + fontweight="bold", + ) + + # set x-axis labels + plt.xticks(positions, x_labels, rotation=45 if not_found_count > 0 else 0) + + # add legend if we have both found and not found + if not_found_count and found_count: + legend_elements = [ + Patch(facecolor="#3498db", alpha=0.7, label="Found in Results"), + Patch(facecolor="#e74c3c", alpha=0.7, label="Not Found"), + ] + plt.legend(handles=legend_elements, loc="upper right") + + # make layout tight and save + plt.tight_layout() + chart_file = export_path / "search_position_chart.png" + plt.savefig(chart_file, dpi=300, bbox_inches="tight") + logger.info("Search position chart saved to: %s", chart_file) + plt.show() + + def _load_dataset(self, dataset_path: Path) -> list[TestQuery]: + """Load the test dataset from a JSON file and validate the ground truth documents.""" + with dataset_path.open("r") as f: + dataset_raw: list[dict] = json.load(f) + + with get_session_with_tenant(tenant_id=self.tenant_id) as db_session: + federated_sources = get_federated_sources(db_session) + + dataset: list[TestQuery] = [] + for datum in dataset_raw: + # validate the raw datum + try: + test_query = TestQuery(**datum) + except ValidationError as e: + logger.error("Incorrectly formatted query %s: %s", datum, e) + continue + + # in case the dataset was copied from the previous run export + if test_query.ground_truth_docids: + dataset.append(test_query) + continue + + # validate and get the ground truth documents + with get_session_with_tenant(tenant_id=self.tenant_id) as db_session: + for ground_truth in test_query.ground_truth: + if ( + doc_id := find_document_id( + ground_truth, federated_sources, db_session + ) + ) is not None: + test_query.ground_truth_docids.append(doc_id) + + if len(test_query.ground_truth_docids) == 0: + logger.warning( + "No ground truth documents found for query: %s, skipping...", + test_query.question, + ) + continue + + dataset.append(test_query) + + return dataset + + @retry(tries=3, delay=1, backoff=2) + def _perform_oneshot_qa(self, query: str) -> OneshotQAResult: + """Perform a OneShot QA query against the Onyx API and time 
it.""" + # create the OneShot QA request + messages = [ThreadMessage(message=query, sender=None, role=MessageType.USER)] + filters = IndexFilters(access_control_list=None, tenant_id=self.tenant_id) + qa_request = OneShotQARequest( + messages=messages, + persona_id=0, # default persona + retrieval_options=RetrievalDetails( + run_search=OptionalSearchSetting.ALWAYS, + real_time=True, + filters=filters, + enable_auto_detect_filters=False, + limit=self.config.max_search_results, + ), + return_contexts=True, + skip_gen_ai_answer_generation=self.config.search_only, + ) + + # send the request + response = None + try: + request_data = qa_request.model_dump() + headers = GENERAL_HEADERS.copy() + if AUTH_TYPE != AuthType.DISABLED: + headers["Authorization"] = f"Bearer {os.environ.get('ONYX_API_KEY')}" + + start_time = time.monotonic() + response = requests.post( + url=f"{self.config.api_url}/query/answer-with-citation", + json=request_data, + headers=headers, + timeout=self.config.request_timeout, + ) + time_taken = time.monotonic() - start_time + response.raise_for_status() + result = OneShotQAResponse.model_validate(response.json()) + + # extract documents from the QA response + if result.docs: + top_documents = result.docs.top_documents + return OneshotQAResult( + time_taken=time_taken, + top_documents=top_documents, + answer=result.answer, + ) + except RequestException as e: + raise RuntimeError( + f"OneShot QA failed for query '{query}': {e}." + f" Response: {response.json()}" + if response + else "" + ) + raise RuntimeError(f"OneShot QA returned no documents for query {query}") + + def _run_and_analyze_one(self, test_case: TestQuery, total: int) -> AnalysisSummary: + result = self._perform_oneshot_qa(test_case.question) + + # compute rank + rank = None + found = False + ground_truths = set(test_case.ground_truth_docids) + for i, doc in enumerate(result.top_documents, 1): + if doc.document_id in ground_truths: + rank = i + found = True + break + + # print search progress and result + with self._lock: + self._progress_counter += 1 + completed = self._progress_counter + status = "✓ Found" if found else "✗ Not found" + rank_info = f" (rank {rank})" if found else "" + question_snippet = ( + test_case.question[:50] + "..." 
+ if len(test_case.question) > 50 + else test_case.question + ) + print(f"[{completed}/{total}] {status}{rank_info}: {question_snippet}") + + # get the search contents + retrieved = search_docs_to_doc_contexts(result.top_documents, self.tenant_id) + + # do answer evaluation + response_relevancy: float | None = None + faithfulness: float | None = None + factual_correctness: float | None = None + contexts = [c.content for c in retrieved[: self.config.max_answer_context]] + if not self.config.search_only: + if result.answer is None: + logger.error( + "No answer found for query: %s, skipping answer evaluation", + test_case.question, + ) + else: + try: + ragas_result = ragas_evaluate( + question=test_case.question, + answer=result.answer, + contexts=contexts, + reference_answer=test_case.ground_truth_response, + ).scores[0] + response_relevancy = ragas_result["answer_relevancy"] + faithfulness = ragas_result["faithfulness"] + factual_correctness = ragas_result.get( + "factual_correctness(mode=recall)" + ) + except Exception as e: + logger.error( + "Error evaluating answer for query %s: %s", + test_case.question, + e, + ) + + # save results + analysis = AnalysisSummary( + question=test_case.question, + categories=test_case.categories, + found=found, + rank=rank, + total_results=len(result.top_documents), + ground_truth_count=len(test_case.ground_truth_docids), + answer=result.answer, + response_relevancy=response_relevancy, + faithfulness=faithfulness, + factual_correctness=factual_correctness, + retrieved=retrieved, + time_taken=result.time_taken, + ) + with self._lock: + self.ranks.append(analysis.rank) + if self._result_writer: + self._result_writer.append(analysis.model_dump(mode="json")) + self._update_metrics(analysis) + + return analysis + + def _update_metrics(self, result: AnalysisSummary) -> None: + for cat in result.categories + ["all"]: + self.metrics[cat].total_queries += 1 + self.metrics[cat].average_time_taken += result.time_taken + + if result.found: + self.metrics[cat].found_count += 1 + + rank = cast(int, result.rank) + self.metrics[cat].best_rank = min(self.metrics[cat].best_rank, rank) + self.metrics[cat].worst_rank = max(self.metrics[cat].worst_rank, rank) + self.metrics[cat].average_rank += rank + for k in TOP_K_LIST: + self.metrics[cat].top_k_accuracy[k] += int(rank <= k) + + if self.config.search_only: + continue + if result.response_relevancy is not None: + self.metrics[cat].response_relevancy += result.response_relevancy + self.metrics[cat].n_response_relevancy += 1 + if result.faithfulness is not None: + self.metrics[cat].faithfulness += result.faithfulness + self.metrics[cat].n_faithfulness += 1 + if result.factual_correctness is not None: + self.metrics[cat].factual_correctness += result.factual_correctness + self.metrics[cat].n_factual_correctness += 1 + + def _aggregate_metrics(self) -> None: + for cat in self.metrics: + total = self.metrics[cat].total_queries + self.metrics[cat].average_time_taken /= total + + if self.metrics[cat].found_count > 0: + self.metrics[cat].average_rank /= self.metrics[cat].found_count + for k in TOP_K_LIST: + self.metrics[cat].top_k_accuracy[k] /= total + self.metrics[cat].top_k_accuracy[k] *= 100 + + if self.config.search_only: + continue + if (n := self.metrics[cat].n_response_relevancy) > 0: + self.metrics[cat].response_relevancy /= n + if (n := self.metrics[cat].n_faithfulness) > 0: + self.metrics[cat].faithfulness /= n + if (n := self.metrics[cat].n_factual_correctness) > 0: + self.metrics[cat].factual_correctness /= n + + +def 
run_search_eval( + dataset_path: Path, + config: EvalConfig, + tenant_id: str | None, +) -> None: + # check openai api key is set if doing answer eval (must be called that for ragas to recognize) + if not config.search_only and not os.environ.get("OPENAI_API_KEY"): + raise RuntimeError( + "OPENAI_API_KEY is required for answer evaluation. " + "Please add it to the root .vscode/.env file." + ) + + # check onyx api key is set if auth is enabled + if AUTH_TYPE != AuthType.DISABLED and not os.environ.get("ONYX_API_KEY"): + raise RuntimeError( + "ONYX_API_KEY is required if auth is enabled. " + "Please create one in the admin panel and add it to the root .vscode/.env file." + ) + + # check onyx is running + try: + response = requests.get( + f"{config.api_url}/health", timeout=config.request_timeout + ) + response.raise_for_status() + except RequestException as e: + raise RuntimeError(f"Could not connect to Onyx API: {e}") + + # create the export folder + export_folder = current_dir / datetime.now().strftime("eval-%Y-%m-%d-%H-%M-%S") + export_path = Path(export_folder) + export_path.mkdir(parents=True, exist_ok=True) + logger.info("Created export folder: %s", export_path) + + # run the search eval + analyzer = SearchAnswerAnalyzer(config=config, tenant_id=tenant_id) + analyzer.run_analysis(dataset_path, export_path) + analyzer.generate_detailed_report(export_path) + analyzer.generate_chart(export_path) if __name__ == "__main__": - if MULTI_TENANT: - raise ValueError("Multi-tenant is not supported currently") + import argparse + + current_dir = Path(__file__).parent + parser = argparse.ArgumentParser(description="Run search quality evaluation.") + parser.add_argument( + "-d", + "--dataset", + type=Path, + default=current_dir / "test_queries.json", + help="Path to the test-set JSON file (default: %(default)s).", + ) + parser.add_argument( + "-n", + "--num_search", + type=int, + default=50, + help="Maximum number of documents to retrieve per search (default: %(default)s).", + ) + parser.add_argument( + "-a", + "--num_answer", + type=int, + default=25, + help="Maximum number of documents to use for answer evaluation (default: %(default)s).", + ) + parser.add_argument( + "-w", + "--max_workers", + type=int, + default=10, + help="Maximum number of concurrent search requests (0 = unlimited, default: %(default)s).", + ) + parser.add_argument( + "-r", + "--max_req_rate", + type=int, + default=0, + help="Maximum number of search requests per minute (0 = unlimited, default: %(default)s).", + ) + parser.add_argument( + "-q", + "--timeout", + type=int, + default=120, + help="Request timeout in seconds (default: %(default)s).", + ) + parser.add_argument( + "-e", + "--api_endpoint", + type=str, + default="http://127.0.0.1:8080", + help="Base URL of the Onyx API server (default: %(default)s).", + ) + parser.add_argument( + "-s", + "--search_only", + action="store_true", + default=False, + help="Only perform search and not answer evaluation (default: %(default)s).", + ) + parser.add_argument( + "-t", + "--tenant_id", + type=str, + default=None, + help="Tenant ID to use for the evaluation (default: %(default)s).", + ) + + args = parser.parse_args() SqlEngine.init_engine( pool_size=POSTGRES_API_SERVER_POOL_SIZE, @@ -153,9 +727,21 @@ def run_search_eval() -> None: ) try: - run_search_eval() + run_search_eval( + args.dataset, + EvalConfig( + max_search_results=args.num_search, + max_answer_context=args.num_answer, + num_workers=args.max_workers, + max_request_rate=args.max_req_rate, + request_timeout=args.timeout, + 
api_url=args.api_endpoint, + search_only=args.search_only, + ), + args.tenant_id, + ) except Exception as e: - logger.error(f"Error running search evaluation: {e}") - raise e + logger.error("Unexpected error during search evaluation: %s", e) + raise finally: SqlEngine.reset_engine() diff --git a/backend/tests/regression/search_quality/search_eval_config.yaml.template b/backend/tests/regression/search_quality/search_eval_config.yaml.template deleted file mode 100644 index 68405ebb116..00000000000 --- a/backend/tests/regression/search_quality/search_eval_config.yaml.template +++ /dev/null @@ -1,16 +0,0 @@ -# Search Parameters -HYBRID_ALPHA: 0.5 -HYBRID_ALPHA_KEYWORD: 0.4 -DOC_TIME_DECAY: 0.5 -NUM_RETURNED_HITS: 50 # Setting to a higher value will improve evaluation quality but increase reranking time -RANK_PROFILE: 'semantic' -OFFSET: 0 -TITLE_CONTENT_RATIO: 0.1 -USER_EMAIL: null # User email to use for testing, modifies access control list, null means only public files - -# Evaluation parameters -SKIP_RERANK: false # Whether to skip reranking, reranking must be enabled to evaluate the search results -EVAL_TOPK: 5 # Number of top results from the searcher and reranker to evaluate, lower means stricter evaluation - -# Export file, will export a csv file with the results and a json file with the parameters -EXPORT_FOLDER: "eval-%Y-%m-%d-%H-%M-%S" diff --git a/backend/tests/regression/search_quality/test_queries.json.template b/backend/tests/regression/search_quality/test_queries.json.template index 93e855472b7..e5646d3f03b 100644 --- a/backend/tests/regression/search_quality/test_queries.json.template +++ b/backend/tests/regression/search_quality/test_queries.json.template @@ -3,20 +3,18 @@ "question": "What is Onyx?", "ground_truth": [ { - "doc_source": "Web", + "doc_source": "web", "doc_link": "https://docs.onyx.app/more/use_cases/overview" }, { - "doc_source": "Web", + "doc_source": "web", "doc_link": "https://docs.onyx.app/more/use_cases/ai_platform" } ], "categories": [ "keyword", - "broad" + "broad", + "easy" ] - }, - { - "question": "What is the meaning of life?" 
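
Each entry in the template above is validated into a `TestQuery` before the run, and queries whose ground-truth links cannot be resolved to document ids are skipped. A minimal sketch of that validation step, using simplified stand-ins for the `GroundTruth`/`TestQuery` models that the patch imports from `tests/regression/search_quality/models.py` (not shown here), so the exact field types may differ:

from pydantic import BaseModel, ValidationError


# Simplified stand-ins for the real models; doc_source is an enum in the patch.
class GroundTruth(BaseModel):
    doc_source: str
    doc_link: str


class TestQuery(BaseModel):
    question: str
    ground_truth: list[GroundTruth] = []
    ground_truth_docids: list[str] = []
    categories: list[str] = []


entry = {
    "question": "What is Onyx?",
    "ground_truth": [
        {"doc_source": "web", "doc_link": "https://docs.onyx.app/more/use_cases/overview"}
    ],
    "categories": ["keyword", "broad", "easy"],
}

try:
    query = TestQuery(**entry)
    print(query.question, [gt.doc_link for gt in query.ground_truth])
except ValidationError as e:
    print(f"Incorrectly formatted query: {e}")
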
} ] \ No newline at end of file diff --git a/backend/tests/regression/search_quality/util_config.py b/backend/tests/regression/search_quality/util_config.py deleted file mode 100644 index 4a06b7b9ec5..00000000000 --- a/backend/tests/regression/search_quality/util_config.py +++ /dev/null @@ -1,75 +0,0 @@ -from datetime import datetime -from pathlib import Path - -import yaml -from pydantic import BaseModel - -from onyx.agents.agent_search.shared_graph_utils.models import QueryExpansionType -from onyx.configs.chat_configs import DOC_TIME_DECAY -from onyx.configs.chat_configs import HYBRID_ALPHA -from onyx.configs.chat_configs import HYBRID_ALPHA_KEYWORD -from onyx.configs.chat_configs import NUM_RETURNED_HITS -from onyx.configs.chat_configs import TITLE_CONTENT_RATIO -from onyx.utils.logger import setup_logger - -logger = setup_logger(__name__) - - -class SearchEvalConfig(BaseModel): - hybrid_alpha: float - hybrid_alpha_keyword: float - doc_time_decay: float - num_returned_hits: int - rank_profile: QueryExpansionType - offset: int - title_content_ratio: float - user_email: str | None - skip_rerank: bool - eval_topk: int - export_folder: str - - -def load_config() -> SearchEvalConfig: - """Loads the search evaluation configs from the config file.""" - # open the config file - current_dir = Path(__file__).parent - config_path = current_dir / "search_eval_config.yaml" - if not config_path.exists(): - raise FileNotFoundError(f"Search eval config file not found at {config_path}") - with config_path.open("r") as file: - config_raw = yaml.safe_load(file) - - # create the export folder - export_folder = config_raw.get("EXPORT_FOLDER", "eval-%Y-%m-%d-%H-%M-%S") - export_folder = datetime.now().strftime(export_folder) - export_path = Path(export_folder) - export_path.mkdir(parents=True, exist_ok=True) - logger.info(f"Created export folder: {export_path}") - - # create the config - config = SearchEvalConfig( - hybrid_alpha=config_raw.get("HYBRID_ALPHA", HYBRID_ALPHA), - hybrid_alpha_keyword=config_raw.get( - "HYBRID_ALPHA_KEYWORD", HYBRID_ALPHA_KEYWORD - ), - doc_time_decay=config_raw.get("DOC_TIME_DECAY", DOC_TIME_DECAY), - num_returned_hits=config_raw.get("NUM_RETURNED_HITS", NUM_RETURNED_HITS), - rank_profile=config_raw.get("RANK_PROFILE", QueryExpansionType.SEMANTIC), - offset=config_raw.get("OFFSET", 0), - title_content_ratio=config_raw.get("TITLE_CONTENT_RATIO", TITLE_CONTENT_RATIO), - user_email=config_raw.get("USER_EMAIL"), - skip_rerank=config_raw.get("SKIP_RERANK", False), - eval_topk=config_raw.get("EVAL_TOPK", 5), - export_folder=export_folder, - ) - logger.info(f"Using search parameters: {config}") - - # export the config - config_file = export_path / "search_eval_config.yaml" - with config_file.open("w") as file: - config_dict = config.model_dump(mode="python") - config_dict["rank_profile"] = config.rank_profile.value - yaml.dump(config_dict, file, sort_keys=False) - logger.info(f"Exported config to {config_file}") - - return config diff --git a/backend/tests/regression/search_quality/util_data.py b/backend/tests/regression/search_quality/util_data.py deleted file mode 100644 index 34f0c5515eb..00000000000 --- a/backend/tests/regression/search_quality/util_data.py +++ /dev/null @@ -1,166 +0,0 @@ -import json -from pathlib import Path -from typing import cast -from typing import Optional - -from langgraph.types import StreamWriter -from pydantic import BaseModel -from pydantic import ValidationError - -from onyx.agents.agent_search.basic.utils import process_llm_stream -from 
onyx.chat.models import PromptConfig -from onyx.chat.prompt_builder.answer_prompt_builder import AnswerPromptBuilder -from onyx.chat.prompt_builder.answer_prompt_builder import default_build_system_message -from onyx.chat.prompt_builder.answer_prompt_builder import default_build_user_message -from onyx.configs.constants import DEFAULT_PERSONA_ID -from onyx.db.engine.sql_engine import get_session_with_current_tenant -from onyx.db.persona import get_persona_by_id -from onyx.llm.factory import get_llms_for_persona -from onyx.llm.interfaces import LLM -from onyx.tools.tool_implementations.search.search_tool import SearchTool -from onyx.tools.utils import explicit_tool_calling_supported -from onyx.utils.logger import setup_logger - -logger = setup_logger() - - -class GroundTruth(BaseModel): - doc_source: str - doc_link: str - - -class TestQuery(BaseModel): - question: str - question_search: Optional[str] = None - ground_truth: list[GroundTruth] = [] - categories: list[str] = [] - - -def load_test_queries() -> list[TestQuery]: - """ - Loads the test queries from the test_queries.json file. - If `question_search` is missing, it will use the tool-calling LLM to generate it. - """ - # open test queries file - current_dir = Path(__file__).parent - test_queries_path = current_dir / "test_queries.json" - logger.info(f"Loading test queries from {test_queries_path}") - if not test_queries_path.exists(): - raise FileNotFoundError(f"Test queries file not found at {test_queries_path}") - with test_queries_path.open("r") as f: - test_queries_raw: list[dict] = json.load(f) - - # setup llm for question_search generation - with get_session_with_current_tenant() as db_session: - persona = get_persona_by_id(DEFAULT_PERSONA_ID, None, db_session) - llm, _ = get_llms_for_persona(persona) - prompt_config = PromptConfig.from_model(persona.prompts[0]) - search_tool = SearchToolOverride() - - tool_call_supported = explicit_tool_calling_supported( - llm.config.model_provider, llm.config.model_name - ) - - # validate keys and generate question_search if missing - test_queries: list[TestQuery] = [] - for query_raw in test_queries_raw: - try: - test_query = TestQuery(**query_raw) - except ValidationError as e: - logger.error(f"Incorrectly formatted query: {e}") - continue - - if test_query.question_search is None: - test_query.question_search = _modify_one_query( - query=test_query.question, - llm=llm, - prompt_config=prompt_config, - tool=search_tool, - tool_call_supported=tool_call_supported, - ) - test_queries.append(test_query) - - return test_queries - - -def export_test_queries(test_queries: list[TestQuery], export_path: Path) -> None: - """Exports the test queries to a JSON file.""" - logger.info(f"Exporting test queries to {export_path}") - with export_path.open("w") as f: - json.dump( - [query.model_dump() for query in test_queries], - f, - indent=4, - ) - - -class SearchToolOverride(SearchTool): - def __init__(self) -> None: - # do nothing, only class variables are required for the functions we call - pass - - -warned = False - - -def _modify_one_query( - query: str, - llm: LLM, - prompt_config: PromptConfig, - tool: SearchTool, - tool_call_supported: bool, - writer: StreamWriter = lambda _: None, -) -> str: - global warned - if not warned: - logger.warning( - "Generating question_search. If you do not save the question_search, " - "it will be generated again on the next run, potentially altering the search results." 
- ) - warned = True - - prompt_builder = AnswerPromptBuilder( - user_message=default_build_user_message( - user_query=query, - prompt_config=prompt_config, - files=[], - single_message_history=None, - ), - system_message=default_build_system_message(prompt_config, llm.config), - message_history=[], - llm_config=llm.config, - raw_user_query=query, - raw_user_uploaded_files=[], - single_message_history=None, - ) - - if tool_call_supported: - prompt = prompt_builder.build() - tool_definition = tool.tool_definition() - stream = llm.stream( - prompt=prompt, - tools=[tool_definition], - tool_choice="required", - structured_response_format=None, - ) - tool_message = process_llm_stream( - messages=stream, - should_stream_answer=False, - writer=writer, - ) - return ( - tool_message.tool_calls[0]["args"]["query"] - if tool_message.tool_calls - else query - ) - - history = prompt_builder.get_message_history() - return cast( - dict[str, str], - tool.get_args_for_non_tool_calling_llm( - query=query, - history=history, - llm=llm, - force_run=True, - ), - )["query"] diff --git a/backend/tests/regression/search_quality/util_eval.py b/backend/tests/regression/search_quality/util_eval.py deleted file mode 100644 index 47fb86d7ee3..00000000000 --- a/backend/tests/regression/search_quality/util_eval.py +++ /dev/null @@ -1,94 +0,0 @@ -from typing import Optional - -from pydantic import BaseModel -from sqlalchemy.orm import Session - -from onyx.context.search.models import InferenceChunk -from onyx.db.models import Document -from onyx.utils.logger import setup_logger -from tests.regression.search_quality.util_retrieve import group_by_documents - -logger = setup_logger(__name__) - - -class Metrics(BaseModel): - # computed if ground truth is provided - ground_truth_ratio_topk: Optional[float] = None - ground_truth_avg_rank_delta: Optional[float] = None - - # computed if reranked results are provided - soft_truth_ratio_topk: Optional[float] = None - soft_truth_avg_rank_delta: Optional[float] = None - - -metric_names = list(Metrics.model_fields.keys()) - - -def get_corresponding_document( - doc_link: str, db_session: Session -) -> Optional[Document]: - """Get the corresponding document from the database.""" - doc_filter = db_session.query(Document).filter(Document.link == doc_link) - count = doc_filter.count() - if count == 0: - logger.warning(f"Could not find document with link {doc_link}, ignoring") - return None - if count > 1: - logger.warning(f"Found multiple documents with link {doc_link}, using first") - return doc_filter.first() - - -def evaluate_one_query( - search_chunks: list[InferenceChunk], - rerank_chunks: list[InferenceChunk], - true_documents: list[Document], - topk: int, -) -> Metrics: - """Computes metrics for the search results, relative to the ground truth and reranked results.""" - metrics_dict: dict[str, float] = {} - - search_documents = group_by_documents(search_chunks) - search_ranks = {docid: rank for rank, docid in enumerate(search_documents)} - search_ranks_topk = { - docid: rank for rank, docid in enumerate(search_documents[:topk]) - } - true_ranks = {doc.id: rank for rank, doc in enumerate(true_documents)} - - if true_documents: - metrics_dict["ground_truth_ratio_topk"] = _compute_ratio( - search_ranks_topk, true_ranks - ) - metrics_dict["ground_truth_avg_rank_delta"] = _compute_avg_rank_delta( - search_ranks, true_ranks - ) - - if rerank_chunks: - # build soft truth out of ground truth + reranked results, up to topk - soft_ranks = true_ranks - for docid in 
group_by_documents(rerank_chunks): - if len(soft_ranks) >= topk: - break - if docid not in soft_ranks: - soft_ranks[docid] = len(soft_ranks) - - metrics_dict["soft_truth_ratio_topk"] = _compute_ratio( - search_ranks_topk, soft_ranks - ) - metrics_dict["soft_truth_avg_rank_delta"] = _compute_avg_rank_delta( - search_ranks, soft_ranks - ) - - return Metrics(**metrics_dict) - - -def _compute_ratio(search_ranks: dict[str, int], true_ranks: dict[str, int]) -> float: - return len(set(search_ranks) & set(true_ranks)) / len(true_ranks) - - -def _compute_avg_rank_delta( - search_ranks: dict[str, int], true_ranks: dict[str, int] -) -> float: - out = len(search_ranks) - return sum( - abs(search_ranks.get(docid, out) - rank) for docid, rank in true_ranks.items() - ) / len(true_ranks) diff --git a/backend/tests/regression/search_quality/util_retrieve.py b/backend/tests/regression/search_quality/util_retrieve.py deleted file mode 100644 index 5ddfa29471f..00000000000 --- a/backend/tests/regression/search_quality/util_retrieve.py +++ /dev/null @@ -1,88 +0,0 @@ -from sqlalchemy.orm import Session - -from onyx.context.search.models import IndexFilters -from onyx.context.search.models import InferenceChunk -from onyx.context.search.models import RerankingDetails -from onyx.context.search.postprocessing.postprocessing import semantic_reranking -from onyx.context.search.preprocessing.preprocessing import query_analysis -from onyx.context.search.retrieval.search_runner import get_query_embedding -from onyx.context.search.utils import remove_stop_words_and_punctuation -from onyx.document_index.interfaces import DocumentIndex -from onyx.utils.logger import setup_logger -from tests.regression.search_quality.util_config import SearchEvalConfig - -logger = setup_logger(__name__) - - -def search_one_query( - question_search: str, - multilingual_expansion: list[str], - document_index: DocumentIndex, - db_session: Session, - config: SearchEvalConfig, -) -> list[InferenceChunk]: - """Uses the search pipeline to retrieve relevant chunks for the given query.""" - # the retrieval preprocessing is fairly stripped down so the query doesn't unexpectedly change - query_embedding = get_query_embedding(question_search, db_session) - - all_query_terms = question_search.split() - processed_keywords = ( - remove_stop_words_and_punctuation(all_query_terms) - if not multilingual_expansion - else all_query_terms - ) - - is_keyword = query_analysis(question_search)[0] - hybrid_alpha = config.hybrid_alpha_keyword if is_keyword else config.hybrid_alpha - - access_control_list = ["PUBLIC"] - if config.user_email: - access_control_list.append(f"user_email:{config.user_email}") - filters = IndexFilters( - tags=[], - user_file_ids=[], - user_folder_ids=[], - access_control_list=access_control_list, - tenant_id=None, - ) - - results = document_index.hybrid_retrieval( - query=question_search, - query_embedding=query_embedding, - final_keywords=processed_keywords, - filters=filters, - hybrid_alpha=hybrid_alpha, - time_decay_multiplier=config.doc_time_decay, - num_to_retrieve=config.num_returned_hits, - ranking_profile_type=config.rank_profile, - offset=config.offset, - title_content_ratio=config.title_content_ratio, - ) - - return [result.to_inference_chunk() for result in results] - - -def rerank_one_query( - question: str, - retrieved_chunks: list[InferenceChunk], - rerank_settings: RerankingDetails, -) -> list[InferenceChunk]: - """Uses the reranker to rerank the retrieved chunks for the given query.""" - rerank_settings.num_rerank = 
len(retrieved_chunks) - return semantic_reranking( - query_str=question, - rerank_settings=rerank_settings, - chunks=retrieved_chunks, - rerank_metrics_callback=None, - )[0] - - -def group_by_documents(chunks: list[InferenceChunk]) -> list[str]: - """Groups a sorted list of chunks into a sorted list of document ids.""" - seen_docids: set[str] = set() - retrieved_docids: list[str] = [] - for chunk in chunks: - if chunk.document_id not in seen_docids: - seen_docids.add(chunk.document_id) - retrieved_docids.append(chunk.document_id) - return retrieved_docids diff --git a/backend/tests/regression/search_quality/utils.py b/backend/tests/regression/search_quality/utils.py new file mode 100644 index 00000000000..dc5b6e53352 --- /dev/null +++ b/backend/tests/regression/search_quality/utils.py @@ -0,0 +1,208 @@ +import json +import re +from pathlib import Path +from textwrap import indent +from typing import Any +from typing import TextIO + +from ragas import evaluate # type: ignore +from ragas import EvaluationDataset # type: ignore +from ragas import SingleTurnSample # type: ignore +from ragas.dataset_schema import EvaluationResult # type: ignore +from ragas.metrics import FactualCorrectness # type: ignore +from ragas.metrics import Faithfulness # type: ignore +from ragas.metrics import ResponseRelevancy # type: ignore +from sqlalchemy.orm import Session + +from onyx.configs.constants import DocumentSource +from onyx.context.search.models import IndexFilters +from onyx.context.search.models import SavedSearchDoc +from onyx.db.engine.sql_engine import get_session_with_tenant +from onyx.db.models import Document +from onyx.db.models import FederatedConnector +from onyx.db.search_settings import get_current_search_settings +from onyx.document_index.factory import get_default_document_index +from onyx.document_index.interfaces import VespaChunkRequest +from onyx.prompts.prompt_utils import build_doc_context_str +from onyx.utils.logger import setup_logger +from tests.regression.search_quality.models import CombinedMetrics +from tests.regression.search_quality.models import GroundTruth +from tests.regression.search_quality.models import RetrievedDocument + +logger = setup_logger(__name__) + + +def get_federated_sources(db_session: Session) -> set[DocumentSource]: + """Get all federated sources from the database.""" + return { + source + for connector in db_session.query(FederatedConnector).all() + if (source := connector.source.to_non_federated_source()) is not None + } + + +def find_document_id( + ground_truth: GroundTruth, + federated_sources: set[DocumentSource], + db_session: Session, +) -> str | None: + """Find a document by its link and return its id if found.""" + # handle federated sources TODO: maybe make handler dictionary by source if this gets complex + if ground_truth.doc_source in federated_sources: + if ground_truth.doc_source == DocumentSource.SLACK: + groups = re.search( + r"archives\/([A-Z0-9]+)\/p([0-9]+)", ground_truth.doc_link + ) + if groups: + channel_id = groups.group(1) + message_id = groups.group(2) + return f"{channel_id}__{message_id[:-6]}.{message_id[-6:]}" + + # preprocess links + doc_link = ground_truth.doc_link + if ground_truth.doc_source == DocumentSource.GOOGLE_DRIVE: + if "/edit" in doc_link: + doc_link = doc_link.split("/edit", 1)[0] + elif "/view" in doc_link: + doc_link = doc_link.split("/view", 1)[0] + elif ground_truth.doc_source == DocumentSource.FIREFLIES: + doc_link = doc_link.split("?", 1)[0] + + docs = 
db_session.query(Document).filter(Document.link.ilike(f"{doc_link}%")).all() + if len(docs) == 0: + logger.warning("Could not find ground truth document: %s", doc_link) + return None + elif len(docs) > 1: + logger.warning( + "Found multiple ground truth documents: %s, using the first one: %s", + doc_link, + docs[0].id, + ) + return docs[0].id + + +def get_doc_contents( + docs: list[SavedSearchDoc], tenant_id: str +) -> dict[tuple[str, int], str]: + with get_session_with_tenant(tenant_id=tenant_id) as db_session: + search_settings = get_current_search_settings(db_session) + document_index = get_default_document_index(search_settings, None) + + filters = IndexFilters(access_control_list=None, tenant_id=tenant_id) + + reqs: list[VespaChunkRequest] = [ + VespaChunkRequest( + document_id=doc.document_id, + min_chunk_ind=doc.chunk_ind, + max_chunk_ind=doc.chunk_ind, + ) + for doc in docs + ] + + results = document_index.id_based_retrieval(chunk_requests=reqs, filters=filters) + return {(doc.document_id, doc.chunk_id): doc.content for doc in results} + + +def search_docs_to_doc_contexts( + docs: list[SavedSearchDoc], tenant_id: str +) -> list[RetrievedDocument]: + try: + doc_contents = get_doc_contents(docs, tenant_id) + except Exception as e: + logger.error("Error getting doc contents: %s", e) + doc_contents = {} + + return [ + RetrievedDocument( + document_id=doc.document_id, + chunk_id=doc.chunk_ind, + content=build_doc_context_str( + semantic_identifier=doc.semantic_identifier, + source_type=doc.source_type, + content=doc_contents.get( + (doc.document_id, doc.chunk_ind), f"Blurb: {doc.blurb}" + ), + metadata_dict=doc.metadata, + updated_at=doc.updated_at, + ind=ind, + include_metadata=True, + ), + ) + for ind, doc in enumerate(docs) + ] + + +def ragas_evaluate( + question: str, answer: str, contexts: list[str], reference_answer: str | None = None +) -> EvaluationResult: + sample = SingleTurnSample( + user_input=question, + retrieved_contexts=contexts, + response=answer, + reference=reference_answer, + ) + dataset = EvaluationDataset([sample]) + return evaluate( + dataset, + metrics=[ + ResponseRelevancy(), + Faithfulness(), + *( + [FactualCorrectness(mode="recall")] + if reference_answer is not None + else [] + ), + ], + ) + + +def compute_overall_scores(metrics: CombinedMetrics) -> tuple[float, float]: + """Compute the overall search and answer quality scores. + The scores are subjective and may require tuning.""" + # search score + FOUND_RATIO_WEIGHT = 0.4 + TOP_IMPORTANCE = 0.7 # 0-inf, how important is it to be no. 
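
`ragas_evaluate` above scores one question/answer/contexts sample per call, and `FactualCorrectness(mode="recall")` is only included when a reference answer is supplied, which is why callers read the optional `factual_correctness(mode=recall)` key. A usage sketch of that helper, assuming ragas is installed and `OPENAI_API_KEY` is set for the judge model; the question, answer and contexts below are made up:

# Minimal usage sketch of the ragas_evaluate helper defined above.
scores = ragas_evaluate(
    question="What is Onyx?",
    answer="Onyx is an open-source AI assistant connected to company knowledge.",
    contexts=["Onyx is an open source Gen-AI and enterprise search platform."],
    reference_answer="Onyx is an open-source enterprise search and AI assistant.",
).scores[0]

response_relevancy = scores["answer_relevancy"]
faithfulness = scores["faithfulness"]
# only present because a reference answer was supplied
factual_correctness = scores.get("factual_correctness(mode=recall)")
print(response_relevancy, faithfulness, factual_correctness)
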
1 over other ranks + + found_ratio = metrics.found_count / metrics.total_queries + sum_k = sum(1.0 / pow(k, TOP_IMPORTANCE) for k in metrics.top_k_accuracy) + weighted_topk = sum( + acc / (pow(k, TOP_IMPORTANCE) * sum_k * 100) + for k, acc in metrics.top_k_accuracy.items() + ) + search_score = 100 * ( + FOUND_RATIO_WEIGHT * found_ratio + (1.0 - FOUND_RATIO_WEIGHT) * weighted_topk + ) + + # answer score + mets = [ + *([metrics.response_relevancy] if metrics.n_response_relevancy > 0 else []), + *([metrics.faithfulness] if metrics.n_faithfulness > 0 else []), + *([metrics.factual_correctness] if metrics.n_factual_correctness > 0 else []), + ] + answer_score = 100 * sum(mets) / len(mets) if mets else 0.0 + + return search_score, answer_score + + +class LazyJsonWriter: + def __init__(self, filepath: Path, indent: int = 4) -> None: + self.filepath = filepath + self.file: TextIO | None = None + self.indent = indent + + def append(self, serializable_item: dict[str, Any]) -> None: + if not self.file: + self.file = open(self.filepath, "a") + self.file.write("[\n") + else: + self.file.write(",\n") + + data = json.dumps(serializable_item, indent=self.indent) + self.file.write(indent(data, " " * self.indent)) + + def close(self) -> None: + if not self.file: + return + self.file.write("\n]") + self.file.close() + self.file = None From 44bee6fc4bf6dc5483018c2c15a962f0a4f995d3 Mon Sep 17 00:00:00 2001 From: Raunak Bhagat Date: Mon, 21 Jul 2025 12:45:48 -0700 Subject: [PATCH 13/78] Remove empty tooltip (#5050) --- web/src/app/admin/add-connector/page.tsx | 68 ++++++++++-------------- web/src/components/SourceTile.tsx | 50 +++++++++++++++++ 2 files changed, 79 insertions(+), 39 deletions(-) create mode 100644 web/src/components/SourceTile.tsx diff --git a/web/src/app/admin/add-connector/page.tsx b/web/src/app/admin/add-connector/page.tsx index 7abb8b29702..588d23cd42d 100644 --- a/web/src/app/admin/add-connector/page.tsx +++ b/web/src/app/admin/add-connector/page.tsx @@ -1,7 +1,6 @@ "use client"; -import { SourceIcon } from "@/components/SourceIcon"; import { AdminPageTitle } from "@/components/admin/Title"; -import { AlertIcon, ConnectorIcon, InfoIcon } from "@/components/icons/icons"; +import { ConnectorIcon } from "@/components/icons/icons"; import { SourceCategory, SourceMetadata } from "@/lib/search/interfaces"; import { listSourceMetadata } from "@/lib/sources"; import Title from "@/components/ui/title"; @@ -31,9 +30,10 @@ import useSWR from "swr"; import { errorHandlingFetcher } from "@/lib/fetcher"; import { buildSimilarCredentialInfoURL } from "@/app/admin/connector/[ccPairId]/lib"; import { Credential } from "@/lib/connectors/credentials"; +import SourceTile from "@/components/SourceTile"; import { SettingsContext } from "@/components/settings/SettingsProvider"; -function SourceTile({ +function SourceTileTooltipWrapper({ sourceMetadata, preSelect, federatedConnectors, @@ -82,46 +82,36 @@ function SourceTile({ sourceMetadata.adminUrl, ]); + // Compute whether to hide the tooltip based on the provided condition + const shouldHideTooltip = + !(existingFederatedConnector && !hasExistingSlackCredentials) && + !hasExistingSlackCredentials && + !sourceMetadata.federated; + + // If tooltip should be hidden, just render the tile as a component + if (shouldHideTooltip) { + return ( + + ); + } + return ( - - {sourceMetadata.federated && !hasExistingSlackCredentials && ( -
- -
- )} - + -

- {sourceMetadata.displayName} -

- +
{existingFederatedConnector && !hasExistingSlackCredentials ? ( @@ -280,7 +270,7 @@ export default function Page() {
{sources.map((source, sourceInd) => ( - 0 && categoryInd == 0 && sourceInd == 0 } diff --git a/web/src/components/SourceTile.tsx b/web/src/components/SourceTile.tsx new file mode 100644 index 00000000000..a6d47a199e4 --- /dev/null +++ b/web/src/components/SourceTile.tsx @@ -0,0 +1,50 @@ +import { SourceIcon } from "@/components/SourceIcon"; +import { AlertIcon } from "@/components/icons/icons"; +import Link from "next/link"; +import { SourceMetadata } from "@/lib/search/interfaces"; +import React from "react"; + +interface SourceTileProps { + sourceMetadata: SourceMetadata; + preSelect?: boolean; + navigationUrl: string; + hasExistingSlackCredentials: boolean; +} + +export default function SourceTile({ + sourceMetadata, + preSelect, + navigationUrl, + hasExistingSlackCredentials, +}: SourceTileProps) { + return ( + + {sourceMetadata.federated && !hasExistingSlackCredentials && ( +
+ +
+ )} + +

{sourceMetadata.displayName}

+ + ); +} From 48f8a68c78ee0a269e613c82624c63fe705d7b53 Mon Sep 17 00:00:00 2001 From: Raunak Bhagat Date: Mon, 21 Jul 2025 15:37:27 -0700 Subject: [PATCH 14/78] feat: Updated KG admin page (#5044) * Update KG admin UI * Styling changes * More changes * Make edits auto-save * Add more stylings / transitions * Fix opacity * Separate out modal into new component * Revert backend changes * Update styling * Add convenience / styling changes to date-picker * More styling / functional updates to kg admin-page * Avoid reducing opacity of active-toggle * Update backend APIs for new KG admin page * More updates of styling for kg-admin page * Remove nullability * Remove console log * Remove unused imports * Change type of `children` variable * Update web/src/app/admin/kg/interfaces.ts Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com> * Update web/src/components/CollapsibleCard.tsx Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com> * Remove null * Update web/src/components/CollapsibleCard.tsx Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> * Force non-null * Fix failing test --------- Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com> Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- backend/onyx/db/entities.py | 24 ++ backend/onyx/db/entity_type.py | 22 +- backend/onyx/server/kg/api.py | 28 +- backend/onyx/server/kg/models.py | 11 + .../tests/integration/tests/kg/test_kg_api.py | 22 +- .../app/admin/connector/[ccPairId]/unused.txt | 0 web/src/app/admin/kg/KGEntityTypes.tsx | 323 ++++++++++++++++++ web/src/app/admin/kg/interfaces.ts | 35 +- web/src/app/admin/kg/page.tsx | 228 +------------ web/src/app/admin/kg/utils.ts | 10 + web/src/components/CollapsibleCard.tsx | 80 +++++ web/src/components/ui/dataTable.tsx | 81 ----- web/src/components/ui/datePicker.tsx | 6 +- web/tailwind-themes/tailwind.config.js | 7 +- 14 files changed, 538 insertions(+), 339 deletions(-) delete mode 100644 web/src/app/admin/connector/[ccPairId]/unused.txt create mode 100644 web/src/app/admin/kg/KGEntityTypes.tsx create mode 100644 web/src/components/CollapsibleCard.tsx delete mode 100644 web/src/components/ui/dataTable.tsx diff --git a/backend/onyx/db/entities.py b/backend/onyx/db/entities.py index d3e92d43116..9696ce92673 100644 --- a/backend/onyx/db/entities.py +++ b/backend/onyx/db/entities.py @@ -308,3 +308,27 @@ def get_entity_name(db_session: Session, entity_id_name: str) -> str | None: db_session.query(KGEntity).filter(KGEntity.id_name == entity_id_name).first() ) return entity.name if entity else None + + +def get_entity_stats_by_grounded_source_name( + db_session: Session, +) -> dict[str, tuple[datetime, int]]: + """ + Returns a dict mapping each grounded_source_name to a tuple in which: + - the first element is the latest update time across all entities with the same entity-type + - the second element is the count of `KGEntity`s + """ + results = ( + db_session.query( + KGEntityType.grounded_source_name, + func.count(KGEntity.id_name).label("entities_count"), + func.max(KGEntity.time_updated).label("last_updated"), + ) + .join(KGEntityType, KGEntity.entity_type_id_name == KGEntityType.id_name) + .group_by(KGEntityType.grounded_source_name) + .all() + ) + return { + row.grounded_source_name: (row.last_updated, row.entities_count) + for row in results + } diff --git a/backend/onyx/db/entity_type.py 
b/backend/onyx/db/entity_type.py index 56c8d367496..54b7bfaff35 100644 --- a/backend/onyx/db/entity_type.py +++ b/backend/onyx/db/entity_type.py @@ -1,3 +1,5 @@ +from collections import defaultdict + from sqlalchemy import update from sqlalchemy.orm import Session @@ -9,6 +11,9 @@ from onyx.server.kg.models import EntityType +_UNGROUNDED_SOURCE_NAME = "Ungrounded" + + def get_entity_types_with_grounded_source_name( db_session: Session, ) -> list[KGEntityType]: @@ -45,7 +50,7 @@ def get_entity_types( ) -def get_configured_entity_types(db_session: Session) -> list[KGEntityType]: +def get_configured_entity_types(db_session: Session) -> dict[str, list[KGEntityType]]: # get entity types from configured sources configured_connector_sources = { source.value.lower() @@ -73,12 +78,20 @@ def get_configured_entity_types(db_session: Session) -> list[KGEntityType]: elif isinstance(implied_et, str): if implied_et not in entity_type_set: entity_type_set.add(implied_et) - return ( + + ets = ( db_session.query(KGEntityType) .filter(KGEntityType.id_name.in_(entity_type_set)) .all() ) + et_map = defaultdict(list) + for et in ets: + key = et.grounded_source_name or _UNGROUNDED_SOURCE_NAME + et_map[key].append(et) + + return et_map + def update_entity_types_and_related_connectors__commit( db_session: Session, updates: list[EntityType] @@ -99,7 +112,10 @@ def update_entity_types_and_related_connectors__commit( configured_entity_types = get_configured_entity_types(db_session=db_session) active_entity_type_sources = { - et.grounded_source_name for et in configured_entity_types if et.active + et.grounded_source_name + for ets in configured_entity_types.values() + for et in ets + if et.active } # Update connectors that should be enabled diff --git a/backend/onyx/server/kg/api.py b/backend/onyx/server/kg/api.py index c59434efa7b..56081997424 100644 --- a/backend/onyx/server/kg/api.py +++ b/backend/onyx/server/kg/api.py @@ -5,6 +5,7 @@ from onyx.auth.users import current_admin_user from onyx.context.search.enums import RecencyBiasSetting from onyx.db.engine.sql_engine import get_session +from onyx.db.entities import get_entity_stats_by_grounded_source_name from onyx.db.entity_type import get_configured_entity_types from onyx.db.entity_type import update_entity_types_and_related_connectors__commit from onyx.db.kg_config import disable_kg @@ -28,6 +29,8 @@ from onyx.server.kg.models import EntityType from onyx.server.kg.models import KGConfig from onyx.server.kg.models import KGConfig as KGConfigAPIModel +from onyx.server.kg.models import SourceAndEntityTypeView +from onyx.server.kg.models import SourceStatistics from onyx.tools.built_in_tools import get_search_tool @@ -54,7 +57,7 @@ def get_kg_exposed(_: User | None = Depends(current_admin_user)) -> bool: def reset_kg( _: User | None = Depends(current_admin_user), db_session: Session = Depends(get_session), -) -> list[EntityType]: +) -> SourceAndEntityTypeView: reset_full_kg_index__commit(db_session) populate_missing_default_entity_types__commit(db_session=db_session) return get_kg_entity_types(db_session=db_session) @@ -173,11 +176,26 @@ def enable_or_disable_kg( def get_kg_entity_types( _: User | None = Depends(current_admin_user), db_session: Session = Depends(get_session), -) -> list[EntityType]: +) -> SourceAndEntityTypeView: # when using for the first time, populate with default entity types - kg_entity_types = get_configured_entity_types(db_session=db_session) - - return [EntityType.from_model(kg_entity_type) for kg_entity_type in kg_entity_types] + 
entity_types = { + key: [EntityType.from_model(et) for et in ets] + for key, ets in get_configured_entity_types(db_session=db_session).items() + } + + source_statistics = { + key: SourceStatistics( + source_name=key, last_updated=last_updated, entities_count=entities_count + ) + for key, ( + last_updated, + entities_count, + ) in get_entity_stats_by_grounded_source_name(db_session=db_session).items() + } + + return SourceAndEntityTypeView( + source_statistics=source_statistics, entity_types=entity_types + ) @admin_router.put("/entity-types") diff --git a/backend/onyx/server/kg/models.py b/backend/onyx/server/kg/models.py index 4bffa673be6..e4527f8e947 100644 --- a/backend/onyx/server/kg/models.py +++ b/backend/onyx/server/kg/models.py @@ -62,3 +62,14 @@ def from_model( active=model.active, grounded_source_name=model.grounded_source_name, ) + + +class SourceStatistics(BaseModel): + source_name: str + last_updated: datetime + entities_count: int + + +class SourceAndEntityTypeView(BaseModel): + source_statistics: dict[str, SourceStatistics] + entity_types: dict[str, list[EntityType]] diff --git a/backend/tests/integration/tests/kg/test_kg_api.py b/backend/tests/integration/tests/kg/test_kg_api.py index 46addb9af31..82cdca15fb9 100644 --- a/backend/tests/integration/tests/kg/test_kg_api.py +++ b/backend/tests/integration/tests/kg/test_kg_api.py @@ -17,6 +17,7 @@ from onyx.server.kg.models import EnableKGConfigRequest from onyx.server.kg.models import EntityType from onyx.server.kg.models import KGConfig as KGConfigAPIModel +from onyx.server.kg.models import SourceAndEntityTypeView from tests.integration.common_utils.constants import API_SERVER_URL from tests.integration.common_utils.managers.user import UserManager from tests.integration.common_utils.reset import reset_all @@ -169,6 +170,7 @@ def test_update_kg_entity_types(connectors: None) -> None: assert ( res2.status_code == HTTPStatus.OK ), f"Error response: {res2.status_code} - {res2.text}" + res2_parsed = SourceAndEntityTypeView.model_validate(res2.json()) # Update entity types req3 = [ @@ -210,16 +212,20 @@ def test_update_kg_entity_types(connectors: None) -> None: assert ( res4.status_code == HTTPStatus.OK ), f"Error response: {res4.status_code} - {res4.text}" + res4_parsed = SourceAndEntityTypeView.model_validate(res4.json()) - new_entity_types = { - entity_type["name"]: EntityType.model_validate(entity_type) - for entity_type in res4.json() - } + def to_entity_type_map(map: dict[str, list[EntityType]]) -> dict[str, EntityType]: + return { + entity_type.name: entity_type + for entity_types in map.values() + for entity_type in entity_types + } - expected_entity_types = { - entity_type["name"]: EntityType.model_validate(entity_type) - for entity_type in res2.json() - } + expected_entity_types = to_entity_type_map(map=res2_parsed.entity_types) + new_entity_types = to_entity_type_map(map=res4_parsed.entity_types) + + # These are the updates. + # We're just manually updating them. expected_entity_types["ACCOUNT"].active = True expected_entity_types["ACCOUNT"].description = "Test." 
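
With these changes, `GET /admin/kg/entity-types` returns entity types grouped by grounded source together with per-source statistics instead of a flat list. A small, self-contained sketch of assembling that response shape, using simplified stand-ins for the pydantic models added in `onyx/server/kg/models.py` and made-up example rows in place of the DB helpers:

from collections import defaultdict
from datetime import datetime

from pydantic import BaseModel


# Stand-ins mirroring the models in onyx/server/kg/models.py.
class EntityType(BaseModel):
    name: str
    description: str
    active: bool
    grounded_source_name: str


class SourceStatistics(BaseModel):
    source_name: str
    last_updated: datetime
    entities_count: int


class SourceAndEntityTypeView(BaseModel):
    source_statistics: dict[str, SourceStatistics]
    entity_types: dict[str, list[EntityType]]


# Illustrative rows standing in for get_configured_entity_types and
# get_entity_stats_by_grounded_source_name.
rows = [
    EntityType(name="ACCOUNT", description="A customer account", active=True,
               grounded_source_name="salesforce"),
    EntityType(name="JIRA", description="A tracked issue", active=True,
               grounded_source_name="jira"),
]
stats = {"salesforce": (datetime(2025, 7, 21), 120), "jira": (datetime(2025, 7, 20), 45)}

grouped: dict[str, list[EntityType]] = defaultdict(list)
for et in rows:
    grouped[et.grounded_source_name].append(et)

view = SourceAndEntityTypeView(
    entity_types=dict(grouped),
    source_statistics={
        source: SourceStatistics(
            source_name=source, last_updated=updated, entities_count=count
        )
        for source, (updated, count) in stats.items()
    },
)
print(view.model_dump(mode="json"))
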
expected_entity_types["OPPORTUNITY"].active = False diff --git a/web/src/app/admin/connector/[ccPairId]/unused.txt b/web/src/app/admin/connector/[ccPairId]/unused.txt deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/web/src/app/admin/kg/KGEntityTypes.tsx b/web/src/app/admin/kg/KGEntityTypes.tsx new file mode 100644 index 00000000000..b674dd91303 --- /dev/null +++ b/web/src/app/admin/kg/KGEntityTypes.tsx @@ -0,0 +1,323 @@ +import { SourceIcon } from "@/components/SourceIcon"; +import React, { useEffect, useState } from "react"; +import { Switch } from "@/components/ui/switch"; +import Link from "next/link"; +import { EntityType, SourceAndEntityTypeView } from "./interfaces"; +import CollapsibleCard from "@/components/CollapsibleCard"; +import { ValidSources } from "@/lib/types"; +import { FaCircleQuestion } from "react-icons/fa6"; +import { Input } from "@/components/ui/input"; +import { CheckmarkIcon } from "@/components/icons/icons"; +import { Button } from "@/components/ui/button"; + +// Utility: Convert capitalized snake case to human readable case +function snakeToHumanReadable(str: string): string { + return ( + str + .toLowerCase() + .replace(/_/g, " ") + .replace(/\b\w/g, (match) => match.toUpperCase()) + // # TODO (@raunakab) + // Special case to replace all instances of "Pr" with "PR". + // This is a *dumb* implementation. If there exists a string that starts with "Pr" (e.g., "Prompt"), + // then this line will stupidly convert it to "PRompt". + // Fix this later (or if this becomes a problem lol). + .replace("Pr", "PR") + ); +} + +// Custom Header Component +function TableHeader() { + return ( +
+
Entity Name
+
Description
+
Active
+
+ ); +} + +// Custom Row Component +function TableRow({ entityType }: { entityType: EntityType }) { + const [entityTypeState, setEntityTypeState] = useState(entityType); + const [descriptionSavingState, setDescriptionSavingState] = useState< + "saving" | "saved" | "failed" | undefined + >(undefined); + + const [timer, setTimer] = useState(null); + const [checkmarkVisible, setCheckmarkVisible] = useState(false); + const [hasMounted, setHasMounted] = useState(false); + + const handleToggle = async (checked: boolean) => { + const response = await fetch("/api/admin/kg/entity-types", { + method: "PUT", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify([{ ...entityType, active: checked }]), + }); + + if (!response.ok) return; + + setEntityTypeState({ ...entityTypeState, active: checked }); + }; + + const handleDescriptionChange = async (description: string) => { + try { + const response = await fetch("/api/admin/kg/entity-types", { + method: "PUT", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify([{ ...entityType, description }]), + }); + if (response.ok) { + setDescriptionSavingState("saved"); + setCheckmarkVisible(true); + setTimeout(() => setCheckmarkVisible(false), 1000); + } else { + setDescriptionSavingState("failed"); + setCheckmarkVisible(false); + } + } catch { + setDescriptionSavingState("failed"); + setCheckmarkVisible(false); + } finally { + setTimeout(() => setDescriptionSavingState(undefined), 1000); + } + }; + + useEffect(() => { + if (!hasMounted) { + setHasMounted(true); + return; + } + if (timer) clearTimeout(timer); + setTimer( + setTimeout(() => { + setDescriptionSavingState("saving"); + setCheckmarkVisible(false); + setTimer( + setTimeout( + () => handleDescriptionChange(entityTypeState.description), + 500 + ) + ); + }, 1000) + ); + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [entityTypeState.description]); + + return ( +
+
+
+
+ + {snakeToHumanReadable(entityType.name)} + +
+
+ + setEntityTypeState({ + ...entityTypeState, + description: e.target.value, + }) + } + onKeyDown={async (e) => { + if (e.key === "Enter") { + e.preventDefault(); + if (timer) { + clearTimeout(timer); + setTimer(null); + } + setDescriptionSavingState("saving"); + setCheckmarkVisible(false); + await handleDescriptionChange( + (e.target as HTMLInputElement).value + ); + } + }} + /> + + + + + + + + +
+
+
+ +
+
+
+ ); +} + +interface KGEntityTypesProps { + sourceAndEntityTypes: SourceAndEntityTypeView; +} + +export default function KGEntityTypes({ + sourceAndEntityTypes, +}: KGEntityTypesProps) { + // State to control open/close of all CollapsibleCards + const [openCards, setOpenCards] = useState<{ [key: string]: boolean }>({}); + // State for search query + const [search, setSearch] = useState(""); + + // Initialize openCards state when data changes + useEffect(() => { + const initialState: { [key: string]: boolean } = {}; + Object.keys(sourceAndEntityTypes.entity_types).forEach((key) => { + initialState[key] = true; + }); + setOpenCards(initialState); + }, [sourceAndEntityTypes]); + + // Handlers for expand/collapse all + const handleExpandAll = () => { + const newState: { [key: string]: boolean } = {}; + Object.keys(sourceAndEntityTypes.entity_types).forEach((key) => { + newState[key] = true; + }); + setOpenCards(newState); + }; + const handleCollapseAll = () => { + const newState: { [key: string]: boolean } = {}; + Object.keys(sourceAndEntityTypes.entity_types).forEach((key) => { + newState[key] = false; + }); + setOpenCards(newState); + }; + + // Determine if all cards are closed + const allClosed = Object.values(openCards).every((v) => v === false); + + return ( +
+
+ setSearch(e.target.value)} + /> + +
+
+ {Object.entries(sourceAndEntityTypes.entity_types).length === 0 ? ( +
+

No results available.

+

+ To configure Knowledge Graph, first connect some{" "} + + Connectors. + +

+
+ ) : ( + Object.entries(sourceAndEntityTypes.entity_types) + .filter(([key]) => + snakeToHumanReadable(key) + .toLowerCase() + .includes(search.toLowerCase()) + ) + .sort(([keyA], [keyB]) => keyA.localeCompare(keyB)) + .map(([key, entityTypesArr]) => { + const stats = sourceAndEntityTypes.source_statistics[key] ?? { + source_name: key, + last_updated: undefined, + entities_count: 0, + }; + return ( +
+ + {Object.values(ValidSources).includes( + key as ValidSources + ) ? ( + + ) : ( + + )} + {snakeToHumanReadable(key)} + + + + Entities Count + + + {stats.entities_count} + + + + + Last Updated + + + {stats.last_updated + ? new Date(stats.last_updated).toLocaleString() + : "N/A"} + + + + + } + // Use a key that changes with openCards[key] to force remount and update defaultOpen + key={`${key}-${openCards[key]}`} + defaultOpen={ + openCards[key] !== undefined ? openCards[key] : true + } + > +
+ + {entityTypesArr.map( + (entityType: EntityType, index: number) => ( + + ) + )} +
+
+
+ ); + }) + )} +
+
+ ); +} diff --git a/web/src/app/admin/kg/interfaces.ts b/web/src/app/admin/kg/interfaces.ts index fa2985a2968..aca740d96ca 100644 --- a/web/src/app/admin/kg/interfaces.ts +++ b/web/src/app/admin/kg/interfaces.ts @@ -16,31 +16,20 @@ export type KGConfigRaw = { export type EntityTypeValues = { [key: string]: EntityType }; +export type SourceAndEntityTypeView = { + source_statistics: Record; + entity_types: Record; +}; + +export type SourceStatistics = { + source_name: string; + last_updated: string; + entities_count: number; +}; + export type EntityType = { name: string; description: string; active: boolean; + grounded_source_name: string; }; - -export function sanitizeKGConfig(raw: KGConfigRaw): KGConfig { - const coverage_start = new Date(raw.coverage_start); - - return { - ...raw, - coverage_start, - }; -} - -export function sanitizeKGEntityTypes( - entityTypes: EntityType[] -): [EntityTypeValues, EntityType[]] { - const entityTypeMap: EntityTypeValues = {}; - for (const entityType of entityTypes) { - entityTypeMap[entityType.name.toLowerCase()] = entityType; - } - - const sortedData = Object.values(entityTypeMap); - sortedData.sort((a, b) => a.name.localeCompare(b.name)); - - return [entityTypeMap, sortedData]; -} diff --git a/web/src/app/admin/kg/page.tsx b/web/src/app/admin/kg/page.tsx index 53f51329889..e83c11be6a9 100644 --- a/web/src/app/admin/kg/page.tsx +++ b/web/src/app/admin/kg/page.tsx @@ -5,7 +5,6 @@ import { AdminPageTitle } from "@/components/admin/Title"; import { DatePickerField, FieldLabel, - TextAreaField, TextArrayField, TextFormField, } from "@/components/Field"; @@ -13,33 +12,19 @@ import { BrainIcon } from "@/components/icons/icons"; import { Modal } from "@/components/Modal"; import { Button } from "@/components/ui/button"; import { SwitchField } from "@/components/ui/switch"; -import { - Form, - Formik, - FormikProps, - FormikState, - useFormikContext, -} from "formik"; +import { Form, Formik, FormikState, useFormikContext } from "formik"; import { useState } from "react"; import { FiSettings } from "react-icons/fi"; import * as Yup from "yup"; -import { - EntityType, - KGConfig, - EntityTypeValues, - sanitizeKGConfig, - KGConfigRaw, - sanitizeKGEntityTypes, -} from "./interfaces"; -import { ColumnDef } from "@tanstack/react-table"; -import { DataTable } from "@/components/ui/dataTable"; +import { KGConfig, KGConfigRaw, SourceAndEntityTypeView } from "./interfaces"; +import { sanitizeKGConfig } from "./utils"; import useSWR from "swr"; import { errorHandlingFetcher } from "@/lib/fetcher"; import { PopupSpec, usePopup } from "@/components/admin/connectors/Popup"; import Title from "@/components/ui/title"; import { redirect } from "next/navigation"; -import Link from "next/link"; import { useIsKGExposed } from "./utils"; +import KGEntityTypes from "./KGEntityTypes"; function createDomainField( name: string, @@ -225,191 +210,6 @@ function KGConfiguration({ ); } -function KGEntityTypes({ - kgEntityTypes, - sortedKGEntityTypes: sorted, - setPopup, - refreshKGEntityTypes, -}: { - kgEntityTypes: EntityTypeValues; - sortedKGEntityTypes: EntityType[]; - setPopup?: (spec: PopupSpec | null) => void; - refreshKGEntityTypes?: () => void; -}) { - const [sortedKGEntityTypes, setSortedKGEntityTypes] = useState(sorted); - console.log({ sortedKGEntityTypes }); - - const columns: ColumnDef[] = [ - { - accessorKey: "name", - header: "Name", - }, - { - accessorKey: "description", - header: "Description", - cell: ({ row }) => ( -
diff --git a/web/src/app/admin/kg/page.tsx b/web/src/app/admin/kg/page.tsx
index 53f51329889..e83c11be6a9 100644
--- a/web/src/app/admin/kg/page.tsx
+++ b/web/src/app/admin/kg/page.tsx
@@ -5,7 +5,6 @@ import { AdminPageTitle } from "@/components/admin/Title";
 import {
   DatePickerField,
   FieldLabel,
-  TextAreaField,
   TextArrayField,
   TextFormField,
 } from "@/components/Field";
@@ -13,33 +12,19 @@ import { BrainIcon } from "@/components/icons/icons";
 import { Modal } from "@/components/Modal";
 import { Button } from "@/components/ui/button";
 import { SwitchField } from "@/components/ui/switch";
-import {
-  Form,
-  Formik,
-  FormikProps,
-  FormikState,
-  useFormikContext,
-} from "formik";
+import { Form, Formik, FormikState, useFormikContext } from "formik";
 import { useState } from "react";
 import { FiSettings } from "react-icons/fi";
 import * as Yup from "yup";
-import {
-  EntityType,
-  KGConfig,
-  EntityTypeValues,
-  sanitizeKGConfig,
-  KGConfigRaw,
-  sanitizeKGEntityTypes,
-} from "./interfaces";
-import { ColumnDef } from "@tanstack/react-table";
-import { DataTable } from "@/components/ui/dataTable";
+import { KGConfig, KGConfigRaw, SourceAndEntityTypeView } from "./interfaces";
+import { sanitizeKGConfig } from "./utils";
 import useSWR from "swr";
 import { errorHandlingFetcher } from "@/lib/fetcher";
 import { PopupSpec, usePopup } from "@/components/admin/connectors/Popup";
 import Title from "@/components/ui/title";
 import { redirect } from "next/navigation";
-import Link from "next/link";
 import { useIsKGExposed } from "./utils";
+import KGEntityTypes from "./KGEntityTypes";
 
 function createDomainField(
   name: string,
@@ -225,191 +210,6 @@ function KGConfiguration({
   );
 }
 
-function KGEntityTypes({
-  kgEntityTypes,
-  sortedKGEntityTypes: sorted,
-  setPopup,
-  refreshKGEntityTypes,
-}: {
-  kgEntityTypes: EntityTypeValues;
-  sortedKGEntityTypes: EntityType[];
-  setPopup?: (spec: PopupSpec | null) => void;
-  refreshKGEntityTypes?: () => void;
-}) {
-  const [sortedKGEntityTypes, setSortedKGEntityTypes] = useState(sorted);
-  console.log({ sortedKGEntityTypes });
-
-  const columns: ColumnDef[] = [
-    {
-      accessorKey: "name",
-      header: "Name",
-    },
-    {
-      accessorKey: "description",
-      header: "Description",
-      cell: ({ row }) => (
-
-
-      ),
-    },
-    {
-      accessorKey: "active",
-      header: "Active",
-      cell: ({ row }) => (
-
-      ),
-    },
-  ];
-
-  const validationSchema = Yup.array(
-    Yup.object({
-      active: Yup.boolean().required(),
-    })
-  );
-
-  const onSubmit = async (
-    values: EntityTypeValues,
-    {
-      resetForm,
-    }: {
-      resetForm: (nextState?: Partial>) => void;
-    }
-  ) => {
-    const diffs: EntityType[] = [];
-
-    for (const key in kgEntityTypes) {
-      const initialValue = kgEntityTypes[key]!;
-      const currentValue = values[key]!;
-      const equals =
-        initialValue.description === currentValue.description &&
-        initialValue.active === currentValue.active;
-      if (!equals) {
-        diffs.push(currentValue);
-      }
-    }
-
-    if (diffs.length === 0) return;
-
-    const response = await fetch("/api/admin/kg/entity-types", {
-      method: "PUT",
-      headers: {
-        "Content-Type": "application/json",
-      },
-      body: JSON.stringify(diffs),
-    });
-
-    if (!response.ok) {
-      const errorMsg = (await response.json()).detail;
-      console.warn({ errorMsg });
-      setPopup?.({
-        message: "Failed to configure Entity Types.",
-        type: "error",
-      });
-      return;
-    }
-
-    setPopup?.({
-      message: "Successfully updated Entity Types.",
-      type: "success",
-    });
-
-    refreshKGEntityTypes?.();
-
-    resetForm({ values });
-  };
-
-  const reset = async (props: FormikProps) => {
-    const result = await fetch("/api/admin/kg/reset", { method: "PUT" });
-
-    if (!result.ok) {
-      setPopup?.({
-        message: "Failed to reset Knowledge Graph.",
-        type: "error",
-      });
-      return;
-    }
-
-    const rawData = (await result.json()) as EntityType[];
-    const [newEntityTypes, newSortedEntityTypes] =
-      sanitizeKGEntityTypes(rawData);
-    props.resetForm({ values: newEntityTypes });
-    setSortedKGEntityTypes(newSortedEntityTypes);
-
-    setPopup?.({
-      message: "Successfully reset Knowledge Graph.",
-      type: "success",
-    });
-
-    refreshKGEntityTypes?.();
-  };
-
-  return (
-
-    {(props) => (
-
-
-
-            No results available.
-
-              To configure Knowledge Graph, first connect some {` `}
-
-              Connectors.
-
-
-
-            }
-          />
-
-
-
-
-
-
-            Danger
-
-
-            Resetting will delete all extracted entities and relationships
-            and deactivate all entity types. After reset, you can reactivate
-            entity types to begin populating the Knowledge Graph again.
-
-
-
-
-      )}
-
-  );
-}
 
 function Main() {
   // Data:
   const {
     data: configData,
     isLoading: configIsLoading,
     mutate: configMutate,
   } = useSWR("/api/admin/kg/config", errorHandlingFetcher);
   const {
-    data: entityTypesData,
+    data: sourceAndEntityTypesData,
     isLoading: entityTypesIsLoading,
     mutate: entityTypesMutate,
-  } = useSWR("/api/admin/kg/entity-types", errorHandlingFetcher);
+  } = useSWR(
+    "/api/admin/kg/entity-types",
+    errorHandlingFetcher
+  );
 
   // Local State:
   const { popup, setPopup } = usePopup();
 
   if (
     configIsLoading ||
     entityTypesIsLoading ||
     !configData ||
-    !entityTypesData
+    !sourceAndEntityTypesData
   ) {
     return <></>;
   }
 
   const kgConfig = sanitizeKGConfig(configData);
-  const [kgEntityTypes, sortedKGEntityTypes] =
-    sanitizeKGEntityTypes(entityTypesData);
 
   return (
@@ -484,15 +285,10 @@ function Main() { {kgConfig.enabled && ( <> -

+
 
               Entity Types
 
-
+
       )}
 
       {configureModalShown && (
diff --git a/web/src/app/admin/kg/utils.ts b/web/src/app/admin/kg/utils.ts
index 8142db581ce..855aa0f0c98 100644
--- a/web/src/app/admin/kg/utils.ts
+++ b/web/src/app/admin/kg/utils.ts
@@ -1,6 +1,7 @@
 import { useUser } from "@/components/user/UserProvider";
 import { errorHandlingFetcher } from "@/lib/fetcher";
 import useSWR from "swr";
+import { KGConfig, KGConfigRaw } from "./interfaces";
 
 export type KgExposedStatus = { kgExposed: boolean; isLoading: boolean };
 
 export function useIsKGExposed(): KgExposedStatus {
   );
   return { kgExposed: kgExposedRaw ?? false, isLoading };
 }
+
+export function sanitizeKGConfig(raw: KGConfigRaw): KGConfig {
+  const coverage_start = new Date(raw.coverage_start);
+
+  return {
+    ...raw,
+    coverage_start,
+  };
+}
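// Usage sketch for the sanitizeKGConfig helper that now lives in utils.ts: the
// /api/admin/kg/config payload (KGConfigRaw) carries coverage_start as a raw
// value (e.g., an ISO string), and the helper converts it to a Date before the
// page works with it. The page itself fetches through useSWR with
// errorHandlingFetcher; the plain-fetch wrapper below is illustrative only.
import { KGConfig, KGConfigRaw } from "@/app/admin/kg/interfaces";
import { sanitizeKGConfig } from "@/app/admin/kg/utils";

async function loadKGConfig(): Promise<KGConfig> {
  const response = await fetch("/api/admin/kg/config");
  if (!response.ok) {
    throw new Error(`Failed to load KG config: ${response.status}`);
  }
  const raw = (await response.json()) as KGConfigRaw;
  return sanitizeKGConfig(raw); // coverage_start -> Date
}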
diff --git a/web/src/components/CollapsibleCard.tsx b/web/src/components/CollapsibleCard.tsx
new file mode 100644
index 00000000000..0fe029616e3
--- /dev/null
+++ b/web/src/components/CollapsibleCard.tsx
@@ -0,0 +1,80 @@
+import { ChevronDown } from "lucide-react";
+import React, { useState, ReactNode, useRef, useLayoutEffect } from "react";
+
+interface CollapsibleCardProps {
+  header: JSX.Element;
+  children: ReactNode;
+  defaultOpen?: boolean;
+  className?: string;
+}
+
+/**
+ * Renders a "collapsible" card which, when collapsed, is meant to showcase very "high-level" information (e.g., the name), but when expanded, can show a list of sub-items which are all related to one another.
+ */
+export default function CollapsibleCard({
+  header,
+  children,
+  defaultOpen = false,
+  className = "",
+}: CollapsibleCardProps) {
+  const [open, setOpen] = useState(defaultOpen);
+  const [maxHeight, setMaxHeight] = useState(undefined);
+  const contentRef = useRef(null);
+
+  // Update maxHeight for animation when open/close
+  useLayoutEffect(() => {
+    if (open && contentRef.current) {
+      setMaxHeight(contentRef.current.scrollHeight + "px");
+    } else {
+      setMaxHeight("0px");
+    }
+  }, [open, children]);
+
+  // If content changes size while open, update maxHeight
+  useLayoutEffect(() => {
+    if (open && contentRef.current) {
+      const handleResize = () => {
+        setMaxHeight(contentRef.current!.scrollHeight + "px");
+      };
+      handleResize();
+      window.addEventListener("resize", handleResize);
+      return () => window.removeEventListener("resize", handleResize);
+    }
+  }, [open, children]);
+
+  return (
+
+
+
+        {children}
+
+
+
+  );
+}
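// Usage sketch for the new CollapsibleCard: the header stays visible while the
// card is collapsed, and the children animate open via the max-height effect
// above. The connector name and rows here are placeholder content, not code
// from this patch.
import CollapsibleCard from "@/components/CollapsibleCard";

export function ExampleSourceCard() {
  return (
    <CollapsibleCard
      defaultOpen
      className="mb-4"
      header={<span className="font-semibold">Confluence</span>}
    >
      <ul className="p-4">
        <li>Entity types: 12 (8 active)</li>
        <li>Last updated: 2025-07-17</li>
      </ul>
    </CollapsibleCard>
  );
}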
diff --git a/web/src/components/ui/dataTable.tsx b/web/src/components/ui/dataTable.tsx
deleted file mode 100644
index ccc786981d5..00000000000
--- a/web/src/components/ui/dataTable.tsx
+++ /dev/null
@@ -1,81 +0,0 @@
-"use client";
-
-import {
-  ColumnDef,
-  flexRender,
-  getCoreRowModel,
-  useReactTable,
-} from "@tanstack/react-table";
-
-import {
-  Table,
-  TableBody,
-  TableCell,
-  TableHead,
-  TableHeader,
-  TableRow,
-} from "@/components/ui/table";
-
-interface DataTableProps {
-  columns: ColumnDef[];
-  data: TData[];
-  emptyMessage?: string | JSX.Element;
-}
-
-export function DataTable({
-  columns,
-  data,
-  emptyMessage = "No results.",
-}: DataTableProps) {
-  const table = useReactTable({
-    data,
-    columns,
-    getCoreRowModel: getCoreRowModel(),
-  });
-
-  return (
-
-
-
-        {table.getHeaderGroups().map((headerGroup) => (
-
-            {headerGroup.headers.map((header) => (
-
-                {header.isPlaceholder
-                  ? null
-                  : flexRender(
-                      header.column.columnDef.header,
-                      header.getContext()
-                    )}
-
-            ))}
-
-        ))}
-
-
-          {table.getRowModel().rows?.length ? (
-            table.getRowModel().rows.map((row) => (
-
-                {row.getVisibleCells().map((cell) => (
-
-                    {flexRender(cell.column.columnDef.cell, cell.getContext())}
-
-                ))}
-
-            ))
-          ) : (
-
-
-                {emptyMessage}
-
-
-          )}
-
-
-  );
-}
diff --git a/web/src/components/ui/datePicker.tsx b/web/src/components/ui/datePicker.tsx
index e169d12d882..370ac9fca5c 100644
--- a/web/src/components/ui/datePicker.tsx
+++ b/web/src/components/ui/datePicker.tsx
@@ -37,9 +37,10 @@ export function DatePicker({
     .fill(currYear)
     .map((currYear, index) => currYear - index);
   const [shownDate, setShownDate] = useState(selectedDate ?? new Date());
+  const [open, setOpen] = useState(false);
 
   return (
-
+