From cf7efa130fc1b31055f7c8f18d65936961a72753 Mon Sep 17 00:00:00 2001 From: Elijah Williams Date: Fri, 2 May 2025 16:07:30 -0600 Subject: [PATCH 01/41] wip on simple streaming --- .../app/routers/index/sessions/__init__.py | 29 +++++++++++++ llm-service/app/services/llm_completion.py | 42 ++++++++++++++++++- 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/llm-service/app/routers/index/sessions/__init__.py b/llm-service/app/routers/index/sessions/__init__.py index 83a38de1..110d3e86 100644 --- a/llm-service/app/routers/index/sessions/__init__.py +++ b/llm-service/app/routers/index/sessions/__init__.py @@ -41,10 +41,12 @@ from typing import Optional from fastapi import APIRouter, Header +from fastapi.responses import StreamingResponse from pydantic import BaseModel from .... import exceptions from ....rag_types import RagPredictConfiguration +from ....services import models from ....services.chat import ( v2_chat, ) @@ -53,6 +55,7 @@ chat_history_manager, ) from ....services.chat_history.paginator import paginate +from ....services.llm_completion import stream_completion from ....services.metadata_apis import session_metadata_api from ....services.mlflow import rating_mlflow_log_metric, feedback_mlflow_log_table from ....services.session import rename_session @@ -161,6 +164,10 @@ class RagStudioChatRequest(BaseModel): configuration: RagPredictConfiguration | None = None +class StreamCompletionRequest(BaseModel): + query: str + + def parse_jwt_cookie(jwt_cookie: str | None) -> str: if jwt_cookie is None: return "unknown" @@ -188,3 +195,25 @@ def chat( configuration = request.configuration or RagPredictConfiguration() return v2_chat(session, request.query, configuration, user_name=origin_remote_user) + + +@router.post( + "/stream-completion", summary="Stream completion responses for the given query" +) +@exceptions.propagates +def stream_chat_completion( + session_id: int, + request: StreamCompletionRequest, + origin_remote_user: Optional[str] = Header(None), +): + session = session_metadata_api.get_session(session_id, user_name=origin_remote_user) + model_name = session.inference_model + + def generate_stream(): + for response in stream_completion(session_id, request.query, model_name): + yield json.dumps( + {"text": response.message.content, "done": response.delta is None} + ) + "\n" + + # todo: write to history, start evals, rewrite question, log to mlfow once the response is done + return StreamingResponse(generate_stream(), media_type="application/x-ndjson") diff --git a/llm-service/app/services/llm_completion.py b/llm-service/app/services/llm_completion.py index 34541ee9..918ca2f9 100644 --- a/llm-service/app/services/llm_completion.py +++ b/llm-service/app/services/llm_completion.py @@ -36,8 +36,15 @@ # DATA. # import itertools +from typing import Generator -from llama_index.core.base.llms.types import ChatMessage, ChatResponse +from llama_index.core.base.llms.types import ( + ChatMessage, + ChatResponse, + ChatResponseGen, + CompletionResponse, + CompletionResponseGen, +) from llama_index.core.llms import LLM from . import models @@ -66,6 +73,24 @@ def completion(session_id: int, question: str, model_name: str) -> ChatResponse: return model.chat(messages) +def stream_completion( + session_id: int, question: str, model_name: str +) -> ChatResponseGen: + """ + Streamed version of the completion function. + Returns a generator that yields ChatResponse objects as they become available. 
+ """ + model = models.LLM.get(model_name) + chat_history = chat_history_manager.retrieve_chat_history(session_id)[:10] + messages = list( + itertools.chain.from_iterable( + map(lambda x: make_chat_messages(x), chat_history) + ) + ) + messages.append(ChatMessage.from_str(question, role="user")) + return model.stream_chat(messages) + + def hypothetical(question: str, configuration: QueryConfiguration) -> str: model: LLM = models.LLM.get(configuration.model_name) prompt: str = ( @@ -73,3 +98,18 @@ def hypothetical(question: str, configuration: QueryConfiguration) -> str: "Produce a brief document that would hypothetically answer this question." ) return model.complete(prompt).text + + +def stream_hypothetical( + question: str, configuration: QueryConfiguration +) -> CompletionResponseGen: + """ + Streamed version of the hypothetical function. + Returns a generator that yields CompletionResponse objects as they become available. + """ + model: LLM = models.LLM.get(configuration.model_name) + prompt: str = ( + f"You are an expert. You are asked: {question}. " + "Produce a brief document that would hypothetically answer this question." + ) + return model.stream_complete(prompt) From 46da524da890a5307c9225a269c2991a1e56390e Mon Sep 17 00:00:00 2001 From: Elijah Williams Date: Mon, 5 May 2025 09:17:33 -0600 Subject: [PATCH 02/41] simple poc for streaming --- ui/src/api/chatApi.ts | 42 +++++++++++++++++++ ui/src/api/utils.ts | 1 + .../FooterComponents/RagChatQueryInput.tsx | 14 ++++++- 3 files changed, 56 insertions(+), 1 deletion(-) diff --git a/ui/src/api/chatApi.ts b/ui/src/api/chatApi.ts index fdb9abb1..2123fc0b 100644 --- a/ui/src/api/chatApi.ts +++ b/ui/src/api/chatApi.ts @@ -384,3 +384,45 @@ const feedbackMutation = async ({ { feedback }, ); }; + +interface ChatRequest { + query: string; + sessionId: string; +} + +interface ChatMutationOptions { + onChunk?: (chunk: string) => void; +} + +export function useStreamChatMutation(options?: ChatMutationOptions) { + return useMutation({ + mutationKey: [MutationKeys.streamChatMutation], + mutationFn: async ({ query, sessionId }: ChatRequest) => { + const res = await fetch( + `${llmServicePath}/sessions/${sessionId}/stream-completion`, + { + method: "POST", + body: JSON.stringify({ query }), + headers: { "Content-Type": "application/json" }, + }, + ); + if (!res.body) throw new Error("Error getting stream completion"); + const reader = res.body.getReader(); + const decoder = new TextDecoder(); + let fullResponse = ""; + let done = false; + + while (!done) { + const { value, done: doneReading } = await reader.read(); + done = doneReading; + // do we need the fallback? + const chunk = decoder.decode(value ?? 
new Uint8Array(), { + stream: true, + }); + fullResponse += chunk; + options?.onChunk?.(chunk); + } + return fullResponse; + }, + }); +} diff --git a/ui/src/api/utils.ts b/ui/src/api/utils.ts index a6c06b5a..ccdce209 100644 --- a/ui/src/api/utils.ts +++ b/ui/src/api/utils.ts @@ -80,6 +80,7 @@ export enum MutationKeys { "removeDataSourceFromProject" = "removeDataSourceFromProject", "updateAmpConfig" = "updateAmpConfig", "restartApplication" = "restartApplication", + "streamChatMutation" = "streamChatMutation", } export enum QueryKeys { diff --git a/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx b/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx index 51cc4a43..8c7959c6 100644 --- a/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx +++ b/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx @@ -40,7 +40,11 @@ import { Button, Flex, Input, InputRef, Switch, Tooltip } from "antd"; import { DatabaseFilled, SendOutlined } from "@ant-design/icons"; import { useContext, useEffect, useRef, useState } from "react"; import { RagChatContext } from "pages/RagChatTab/State/RagChatContext.tsx"; -import { createQueryConfiguration, useChatMutation } from "src/api/chatApi.ts"; +import { + createQueryConfiguration, + useChatMutation, + useStreamChatMutation, +} from "src/api/chatApi.ts"; import { useParams, useSearch } from "@tanstack/react-router"; import { cdlBlue600 } from "src/cuix/variables.ts"; @@ -62,6 +66,7 @@ const RagChatQueryInput = ({ const [userInput, setUserInput] = useState(""); const { sessionId } = useParams({ strict: false }); + const [, setResponse] = useState(""); const search: { question?: string } = useSearch({ strict: false, }); @@ -85,6 +90,12 @@ const RagChatQueryInput = ({ }, }); + const streamChatMutation = useStreamChatMutation({ + onChunk: (chunk) => { + setResponse((prev) => prev + chunk); + }, + }); + useEffect(() => { if (inputRef.current) { inputRef.current.focus(); @@ -97,6 +108,7 @@ const RagChatQueryInput = ({ } if (userInput.length > 0) { if (sessionId) { + streamChatMutation.mutate({ query: userInput, sessionId: sessionId }); chatMutation.mutate({ query: userInput, session_id: +sessionId, From 52904cb1bef0e09f4ec591244ee727374d233a4f Mon Sep 17 00:00:00 2001 From: Elijah Williams Date: Mon, 5 May 2025 09:18:22 -0600 Subject: [PATCH 03/41] remove usage from RagChatQueryInput.tsx --- .../FooterComponents/RagChatQueryInput.tsx | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx b/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx index 8c7959c6..51cc4a43 100644 --- a/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx +++ b/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx @@ -40,11 +40,7 @@ import { Button, Flex, Input, InputRef, Switch, Tooltip } from "antd"; import { DatabaseFilled, SendOutlined } from "@ant-design/icons"; import { useContext, useEffect, useRef, useState } from "react"; import { RagChatContext } from "pages/RagChatTab/State/RagChatContext.tsx"; -import { - createQueryConfiguration, - useChatMutation, - useStreamChatMutation, -} from "src/api/chatApi.ts"; +import { createQueryConfiguration, useChatMutation } from "src/api/chatApi.ts"; import { useParams, useSearch } from "@tanstack/react-router"; import { cdlBlue600 } from "src/cuix/variables.ts"; @@ -66,7 +62,6 @@ const RagChatQueryInput = ({ const [userInput, setUserInput] = useState(""); const { sessionId } = useParams({ 
strict: false }); - const [, setResponse] = useState(""); const search: { question?: string } = useSearch({ strict: false, }); @@ -90,12 +85,6 @@ const RagChatQueryInput = ({ }, }); - const streamChatMutation = useStreamChatMutation({ - onChunk: (chunk) => { - setResponse((prev) => prev + chunk); - }, - }); - useEffect(() => { if (inputRef.current) { inputRef.current.focus(); @@ -108,7 +97,6 @@ const RagChatQueryInput = ({ } if (userInput.length > 0) { if (sessionId) { - streamChatMutation.mutate({ query: userInput, sessionId: sessionId }); chatMutation.mutate({ query: userInput, session_id: +sessionId, From 5ff5da80c23a4b5b9b8afd5714505251c5ee956f Mon Sep 17 00:00:00 2001 From: Elijah Williams Date: Mon, 5 May 2025 09:22:53 -0600 Subject: [PATCH 04/41] remove stream hypothetical --- llm-service/app/services/llm_completion.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/llm-service/app/services/llm_completion.py b/llm-service/app/services/llm_completion.py index 918ca2f9..787469cb 100644 --- a/llm-service/app/services/llm_completion.py +++ b/llm-service/app/services/llm_completion.py @@ -36,14 +36,11 @@ # DATA. # import itertools -from typing import Generator from llama_index.core.base.llms.types import ( ChatMessage, ChatResponse, ChatResponseGen, - CompletionResponse, - CompletionResponseGen, ) from llama_index.core.llms import LLM @@ -98,18 +95,3 @@ def hypothetical(question: str, configuration: QueryConfiguration) -> str: "Produce a brief document that would hypothetically answer this question." ) return model.complete(prompt).text - - -def stream_hypothetical( - question: str, configuration: QueryConfiguration -) -> CompletionResponseGen: - """ - Streamed version of the hypothetical function. - Returns a generator that yields CompletionResponse objects as they become available. - """ - model: LLM = models.LLM.get(configuration.model_name) - prompt: str = ( - f"You are an expert. You are asked: {question}. " - "Produce a brief document that would hypothetically answer this question." - ) - return model.stream_complete(prompt) From 96be99aa710e736ea7434964dc0f6ad27c990b3a Mon Sep 17 00:00:00 2001 From: Elijah Williams Date: Mon, 5 May 2025 09:29:16 -0600 Subject: [PATCH 05/41] remove unused import --- llm-service/app/routers/index/sessions/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llm-service/app/routers/index/sessions/__init__.py b/llm-service/app/routers/index/sessions/__init__.py index 110d3e86..01563e45 100644 --- a/llm-service/app/routers/index/sessions/__init__.py +++ b/llm-service/app/routers/index/sessions/__init__.py @@ -46,7 +46,6 @@ from .... 
import exceptions from ....rag_types import RagPredictConfiguration -from ....services import models from ....services.chat import ( v2_chat, ) From 4e8e6f005a53bec025f994fb0f9dddbea589373e Mon Sep 17 00:00:00 2001 From: Elijah Williams Date: Mon, 5 May 2025 12:35:12 -0600 Subject: [PATCH 06/41] wip on doing something once the gen is done --- llm-service/app/routers/index/sessions/__init__.py | 8 +++++++- llm-service/app/services/llm_completion.py | 7 +++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/llm-service/app/routers/index/sessions/__init__.py b/llm-service/app/routers/index/sessions/__init__.py index 01563e45..d6127b81 100644 --- a/llm-service/app/routers/index/sessions/__init__.py +++ b/llm-service/app/routers/index/sessions/__init__.py @@ -214,5 +214,11 @@ def generate_stream(): {"text": response.message.content, "done": response.delta is None} ) + "\n" + try: + stream = generate_stream() + finally: + print("DONE") + + # kick off evals with full response # todo: write to history, start evals, rewrite question, log to mlfow once the response is done - return StreamingResponse(generate_stream(), media_type="application/x-ndjson") + return StreamingResponse(stream, media_type="application/x-ndjson") diff --git a/llm-service/app/services/llm_completion.py b/llm-service/app/services/llm_completion.py index 787469cb..b532e3be 100644 --- a/llm-service/app/services/llm_completion.py +++ b/llm-service/app/services/llm_completion.py @@ -36,6 +36,7 @@ # DATA. # import itertools +from typing import Generator from llama_index.core.base.llms.types import ( ChatMessage, @@ -72,7 +73,7 @@ def completion(session_id: int, question: str, model_name: str) -> ChatResponse: def stream_completion( session_id: int, question: str, model_name: str -) -> ChatResponseGen: +) -> Generator[ChatResponse, None, None]: """ Streamed version of the completion function. Returns a generator that yields ChatResponse objects as they become available. 
@@ -85,7 +86,9 @@ def stream_completion( ) ) messages.append(ChatMessage.from_str(question, role="user")) - return model.stream_chat(messages) + + stream = model.stream_chat(messages) + return stream def hypothetical(question: str, configuration: QueryConfiguration) -> str: From 9cb719f99f66cd5330d3b78fd89f7f1ae5b58e58 Mon Sep 17 00:00:00 2001 From: Elijah Williams Date: Mon, 5 May 2025 14:11:55 -0600 Subject: [PATCH 07/41] progress on generators --- .../app/routers/index/sessions/__init__.py | 21 ++++-- llm-service/app/services/chat.py | 74 ++++++++++++++++++- ui/src/api/chatApi.ts | 21 +++--- .../FooterComponents/RagChatQueryInput.tsx | 20 ++++- 4 files changed, 115 insertions(+), 21 deletions(-) diff --git a/llm-service/app/routers/index/sessions/__init__.py b/llm-service/app/routers/index/sessions/__init__.py index d6127b81..c865a62b 100644 --- a/llm-service/app/routers/index/sessions/__init__.py +++ b/llm-service/app/routers/index/sessions/__init__.py @@ -48,6 +48,7 @@ from ....rag_types import RagPredictConfiguration from ....services.chat import ( v2_chat, + v3_chat, ) from ....services.chat_history.chat_history_manager import ( RagStudioChatMessage, @@ -202,23 +203,27 @@ def chat( @exceptions.propagates def stream_chat_completion( session_id: int, - request: StreamCompletionRequest, + request: RagStudioChatRequest, origin_remote_user: Optional[str] = Header(None), ): session = session_metadata_api.get_session(session_id, user_name=origin_remote_user) - model_name = session.inference_model + configuration = request.configuration or RagPredictConfiguration() def generate_stream(): - for response in stream_completion(session_id, request.query, model_name): + for response in v3_chat( + session, request.query, configuration, user_name=origin_remote_user + ): yield json.dumps( {"text": response.message.content, "done": response.delta is None} ) + "\n" - try: - stream = generate_stream() - finally: - print("DONE") + def full_response(): + yield json.dumps({"text": "Another one", "done": True}) + "\n" + + def combined_gen(): + yield from generate_stream() + yield from full_response() # kick off evals with full response # todo: write to history, start evals, rewrite question, log to mlfow once the response is done - return StreamingResponse(stream, media_type="application/x-ndjson") + return StreamingResponse(combined_gen(), media_type="application/x-ndjson") diff --git a/llm-service/app/services/chat.py b/llm-service/app/services/chat.py index 0eb385e8..6f9fbb10 100644 --- a/llm-service/app/services/chat.py +++ b/llm-service/app/services/chat.py @@ -38,10 +38,10 @@ import time import uuid from random import shuffle -from typing import List, Iterable, Optional +from typing import List, Iterable, Optional, Generator from fastapi import HTTPException -from llama_index.core.base.llms.types import MessageRole +from llama_index.core.base.llms.types import MessageRole, ChatResponse from llama_index.core.chat_engine.types import AgentChatResponse from pydantic import BaseModel @@ -67,6 +67,44 @@ class RagContext(BaseModel): content: str +def v3_chat( + session: Session, + query: str, + configuration: RagPredictConfiguration, + user_name: Optional[str], +) -> Generator[ChatResponse, None, RagStudioChatMessage]: + query_configuration = QueryConfiguration( + top_k=session.response_chunks, + model_name=session.inference_model, + rerank_model_name=session.rerank_model, + exclude_knowledge_base=configuration.exclude_knowledge_base, + use_question_condensing=configuration.use_question_condensing, + 
use_hyde=session.query_configuration.enable_hyde, + use_summary_filter=session.query_configuration.enable_summary_filter, + ) + + if configuration.exclude_knowledge_base or len(session.data_source_ids) == 0: + return stream_direct_llm_chat(session, query, user_name=user_name) + + total_data_sources_size: int = sum( + map( + lambda ds_id: VectorStoreFactory.for_chunks(ds_id).size() or 0, + session.data_source_ids, + ) + ) + if total_data_sources_size == 0: + return stream_direct_llm_chat(session, query, user_name) + + response_id = str(uuid.uuid4()) + + new_chat_message: RagStudioChatMessage = _run_chat( + session, response_id, query, query_configuration, user_name + ) + + chat_history_manager.append_to_history(session.id, [new_chat_message]) + return new_chat_message + + def v2_chat( session: Session, query: str, @@ -337,3 +375,35 @@ def direct_llm_chat( ) chat_history_manager.append_to_history(session.id, [new_chat_message]) return new_chat_message + + +def stream_direct_llm_chat( + session: Session, query: str, user_name: Optional[str] +) -> Generator[ChatResponse, None, RagStudioChatMessage]: + response_id = str(uuid.uuid4()) + record_direct_llm_mlflow_run(response_id, session, user_name) + + chat_response = llm_completion.stream_completion( + session.id, query, session.inference_model + ) + yield from chat_response + + assistant_message = "" + for response in chat_response: + assistant_message += response.message.content + + new_chat_message = RagStudioChatMessage( + id=response_id, + session_id=session.id, + source_nodes=[], + inference_model=session.inference_model, + evaluations=[], + rag_message=RagMessage( + user=query, + assistant=assistant_message, + ), + timestamp=time.time(), + condensed_question=None, + ) + chat_history_manager.append_to_history(session.id, [new_chat_message]) + return new_chat_message diff --git a/ui/src/api/chatApi.ts b/ui/src/api/chatApi.ts index 2123fc0b..5828b81a 100644 --- a/ui/src/api/chatApi.ts +++ b/ui/src/api/chatApi.ts @@ -385,11 +385,6 @@ const feedbackMutation = async ({ ); }; -interface ChatRequest { - query: string; - sessionId: string; -} - interface ChatMutationOptions { onChunk?: (chunk: string) => void; } @@ -397,12 +392,16 @@ interface ChatMutationOptions { export function useStreamChatMutation(options?: ChatMutationOptions) { return useMutation({ mutationKey: [MutationKeys.streamChatMutation], - mutationFn: async ({ query, sessionId }: ChatRequest) => { + mutationFn: async ({ + query, + configuration, + session_id, + }: ChatMutationRequest) => { const res = await fetch( - `${llmServicePath}/sessions/${sessionId}/stream-completion`, + `${llmServicePath}/sessions/${session_id.toString()}/stream-completion`, { method: "POST", - body: JSON.stringify({ query }), + body: JSON.stringify({ query, configuration }), headers: { "Content-Type": "application/json" }, }, ); @@ -411,7 +410,6 @@ export function useStreamChatMutation(options?: ChatMutationOptions) { const decoder = new TextDecoder(); let fullResponse = ""; let done = false; - while (!done) { const { value, done: doneReading } = await reader.read(); done = doneReading; @@ -419,6 +417,11 @@ export function useStreamChatMutation(options?: ChatMutationOptions) { const chunk = decoder.decode(value ?? 
new Uint8Array(), { stream: true, }); + if (doneReading) { + console.log("HELLO"); + } else { + console.log("CHUNK", chunk); + } fullResponse += chunk; options?.onChunk?.(chunk); } diff --git a/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx b/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx index 51cc4a43..80146789 100644 --- a/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx +++ b/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx @@ -40,7 +40,11 @@ import { Button, Flex, Input, InputRef, Switch, Tooltip } from "antd"; import { DatabaseFilled, SendOutlined } from "@ant-design/icons"; import { useContext, useEffect, useRef, useState } from "react"; import { RagChatContext } from "pages/RagChatTab/State/RagChatContext.tsx"; -import { createQueryConfiguration, useChatMutation } from "src/api/chatApi.ts"; +import { + createQueryConfiguration, + useChatMutation, + useStreamChatMutation, +} from "src/api/chatApi.ts"; import { useParams, useSearch } from "@tanstack/react-router"; import { cdlBlue600 } from "src/cuix/variables.ts"; @@ -62,11 +66,12 @@ const RagChatQueryInput = ({ const [userInput, setUserInput] = useState(""); const { sessionId } = useParams({ strict: false }); + const [response, setResponse] = useState(""); const search: { question?: string } = useSearch({ strict: false, }); const inputRef = useRef(null); - + // console.log(response); const { data: sampleQuestions, isPending: sampleQuestionsIsPending, @@ -85,6 +90,12 @@ const RagChatQueryInput = ({ }, }); + const streamChatMutation = useStreamChatMutation({ + onChunk: (chunk) => { + setResponse((prev) => prev + chunk); + }, + }); + useEffect(() => { if (inputRef.current) { inputRef.current.focus(); @@ -97,6 +108,11 @@ const RagChatQueryInput = ({ } if (userInput.length > 0) { if (sessionId) { + streamChatMutation.mutate({ + query: userInput, + configuration: createQueryConfiguration(excludeKnowledgeBase), + session_id: +sessionId, + }); chatMutation.mutate({ query: userInput, session_id: +sessionId, From 8c25d448916b038fbc386de03b820c003dbbcc21 Mon Sep 17 00:00:00 2001 From: Elijah Williams Date: Mon, 5 May 2025 16:47:34 -0600 Subject: [PATCH 08/41] go back to simple streaming only endpoint --- llm-service/app/routers/index/sessions/__init__.py | 4 ++-- llm-service/app/services/chat.py | 8 +++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/llm-service/app/routers/index/sessions/__init__.py b/llm-service/app/routers/index/sessions/__init__.py index c865a62b..a8d2d2a7 100644 --- a/llm-service/app/routers/index/sessions/__init__.py +++ b/llm-service/app/routers/index/sessions/__init__.py @@ -205,7 +205,7 @@ def stream_chat_completion( session_id: int, request: RagStudioChatRequest, origin_remote_user: Optional[str] = Header(None), -): +) -> StreamingResponse: session = session_metadata_api.get_session(session_id, user_name=origin_remote_user) configuration = request.configuration or RagPredictConfiguration() @@ -218,7 +218,7 @@ def generate_stream(): ) + "\n" def full_response(): - yield json.dumps({"text": "Another one", "done": True}) + "\n" + yield json.dumps({"text": "Done", "done": True}) + "\n" def combined_gen(): yield from generate_stream() diff --git a/llm-service/app/services/chat.py b/llm-service/app/services/chat.py index 6f9fbb10..c564c155 100644 --- a/llm-service/app/services/chat.py +++ b/llm-service/app/services/chat.py @@ -72,7 +72,7 @@ def v3_chat( query: str, configuration: RagPredictConfiguration, user_name: Optional[str], -) -> 
Generator[ChatResponse, None, RagStudioChatMessage]: +) -> Generator[ChatResponse, None, None]: query_configuration = QueryConfiguration( top_k=session.response_chunks, model_name=session.inference_model, @@ -379,18 +379,17 @@ def direct_llm_chat( def stream_direct_llm_chat( session: Session, query: str, user_name: Optional[str] -) -> Generator[ChatResponse, None, RagStudioChatMessage]: +) -> Generator[ChatResponse, None, None]: response_id = str(uuid.uuid4()) record_direct_llm_mlflow_run(response_id, session, user_name) chat_response = llm_completion.stream_completion( session.id, query, session.inference_model ) - yield from chat_response - assistant_message = "" for response in chat_response: assistant_message += response.message.content + yield response new_chat_message = RagStudioChatMessage( id=response_id, @@ -406,4 +405,3 @@ def stream_direct_llm_chat( condensed_question=None, ) chat_history_manager.append_to_history(session.id, [new_chat_message]) - return new_chat_message From e25408ddf496b8c79199da925a1fbe2052f35af9 Mon Sep 17 00:00:00 2001 From: jwatson Date: Mon, 5 May 2025 16:20:34 -0700 Subject: [PATCH 09/41] wip lastFile:llm-service/app/services/chat.py --- .../app/routers/index/sessions/__init__.py | 16 ++++++++++++++-- llm-service/app/services/chat.py | 5 ++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/llm-service/app/routers/index/sessions/__init__.py b/llm-service/app/routers/index/sessions/__init__.py index a8d2d2a7..596aa184 100644 --- a/llm-service/app/routers/index/sessions/__init__.py +++ b/llm-service/app/routers/index/sessions/__init__.py @@ -40,7 +40,7 @@ import logging from typing import Optional -from fastapi import APIRouter, Header +from fastapi import APIRouter, Header, HTTPException from fastapi.responses import StreamingResponse from pydantic import BaseModel @@ -55,7 +55,6 @@ chat_history_manager, ) from ....services.chat_history.paginator import paginate -from ....services.llm_completion import stream_completion from ....services.metadata_apis import session_metadata_api from ....services.mlflow import rating_mlflow_log_metric, feedback_mlflow_log_table from ....services.session import rename_session @@ -103,6 +102,19 @@ def chat_history( ) +@router.get( + "/chat-history/{message_id}", + summary="Returns a specific chat messages for the provided session.", +) +@exceptions.propagates +def get_message_by_id(session_id: int, message_id: str) -> RagStudioChatMessage: + results: list[RagStudioChatMessage] = chat_history_manager.retrieve_chat_history(session_id=session_id) + for message in results: + if message.id == message_id: + return message + raise HTTPException(status_code=404, detail=f"Message with id {message_id} not found in session {session_id}") + + @router.delete( "/chat-history", summary="Deletes the chat history for the provided session." 
) diff --git a/llm-service/app/services/chat.py b/llm-service/app/services/chat.py index c564c155..f2e93cdf 100644 --- a/llm-service/app/services/chat.py +++ b/llm-service/app/services/chat.py @@ -386,9 +386,8 @@ def stream_direct_llm_chat( chat_response = llm_completion.stream_completion( session.id, query, session.inference_model ) - assistant_message = "" + response = "" for response in chat_response: - assistant_message += response.message.content yield response new_chat_message = RagStudioChatMessage( @@ -399,7 +398,7 @@ def stream_direct_llm_chat( evaluations=[], rag_message=RagMessage( user=query, - assistant=assistant_message, + assistant=response.message.content, ), timestamp=time.time(), condensed_question=None, From 1bd9f5fe2f761b1bedafb73175b7b3add7492680 Mon Sep 17 00:00:00 2001 From: jwatson Date: Mon, 5 May 2025 16:39:21 -0700 Subject: [PATCH 10/41] add response id on every chunk returned lastFile:llm-service/app/routers/index/sessions/__init__.py --- llm-service/app/routers/index/sessions/__init__.py | 2 +- llm-service/app/services/chat.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/llm-service/app/routers/index/sessions/__init__.py b/llm-service/app/routers/index/sessions/__init__.py index 596aa184..6e611fcb 100644 --- a/llm-service/app/routers/index/sessions/__init__.py +++ b/llm-service/app/routers/index/sessions/__init__.py @@ -226,7 +226,7 @@ def generate_stream(): session, request.query, configuration, user_name=origin_remote_user ): yield json.dumps( - {"text": response.message.content, "done": response.delta is None} + {"text": response.message.content, "response_id": response.additional_kwargs["response_id"], "done": response.delta is None} ) + "\n" def full_response(): diff --git a/llm-service/app/services/chat.py b/llm-service/app/services/chat.py index f2e93cdf..68181653 100644 --- a/llm-service/app/services/chat.py +++ b/llm-service/app/services/chat.py @@ -41,7 +41,7 @@ from typing import List, Iterable, Optional, Generator from fastapi import HTTPException -from llama_index.core.base.llms.types import MessageRole, ChatResponse +from llama_index.core.base.llms.types import MessageRole, ChatResponse, ChatMessage from llama_index.core.chat_engine.types import AgentChatResponse from pydantic import BaseModel @@ -386,8 +386,9 @@ def stream_direct_llm_chat( chat_response = llm_completion.stream_completion( session.id, query, session.inference_model ) - response = "" + response: ChatResponse = ChatResponse(message=ChatMessage(content=query)) for response in chat_response: + response.additional_kwargs["response_id"] = response_id yield response new_chat_message = RagStudioChatMessage( From 4e9b8ef21730c38e194f31fc488eeb50d23a451e Mon Sep 17 00:00:00 2001 From: jwatson Date: Mon, 5 May 2025 16:42:40 -0700 Subject: [PATCH 11/41] remove duplicate calls, but still not rendering --- ui/src/api/chatApi.ts | 10 +++++----- .../FooterComponents/RagChatQueryInput.tsx | 13 +++++++------ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/ui/src/api/chatApi.ts b/ui/src/api/chatApi.ts index 5828b81a..83efc4c9 100644 --- a/ui/src/api/chatApi.ts +++ b/ui/src/api/chatApi.ts @@ -417,11 +417,11 @@ export function useStreamChatMutation(options?: ChatMutationOptions) { const chunk = decoder.decode(value ?? 
new Uint8Array(), { stream: true, }); - if (doneReading) { - console.log("HELLO"); - } else { - console.log("CHUNK", chunk); - } + // if (doneReading) { + // console.log("HELLO"); + // } else { + // console.log("CHUNK", chunk); + // } fullResponse += chunk; options?.onChunk?.(chunk); } diff --git a/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx b/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx index 80146789..fe14d1a1 100644 --- a/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx +++ b/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx @@ -92,7 +92,8 @@ const RagChatQueryInput = ({ const streamChatMutation = useStreamChatMutation({ onChunk: (chunk) => { - setResponse((prev) => prev + chunk); + console.log("stream chunk", chunk); + setResponse(() => chunk); }, }); @@ -113,11 +114,11 @@ const RagChatQueryInput = ({ configuration: createQueryConfiguration(excludeKnowledgeBase), session_id: +sessionId, }); - chatMutation.mutate({ - query: userInput, - session_id: +sessionId, - configuration: createQueryConfiguration(excludeKnowledgeBase), - }); + // chatMutation.mutate({ + // query: userInput, + // session_id: +sessionId, + // configuration: createQueryConfiguration(excludeKnowledgeBase), + // }); } else { newSessionCallback(userInput); } From 3b915220160eb23a4a3357f888e655dff07a919a Mon Sep 17 00:00:00 2001 From: Elijah Williams Date: Tue, 6 May 2025 11:31:07 -0600 Subject: [PATCH 12/41] getting there --- .../app/routers/index/sessions/__init__.py | 26 ++-- ui/package.json | 3 + ui/pnpm-lock.yaml | 88 ++++++++++++ ui/src/api/chatApi.ts | 128 +++++++++++++++++- ui/src/pages/RagChatTab/ChatLayout.tsx | 2 + .../ChatOutput/ChatMessages/ChatMessage.tsx | 2 + .../Loaders/PendingRagOutputSkeleton.tsx | 30 +++- .../FooterComponents/RagChatQueryInput.tsx | 26 ++-- .../pages/RagChatTab/State/RagChatContext.tsx | 2 + 9 files changed, 274 insertions(+), 33 deletions(-) diff --git a/llm-service/app/routers/index/sessions/__init__.py b/llm-service/app/routers/index/sessions/__init__.py index 6e611fcb..485270b9 100644 --- a/llm-service/app/routers/index/sessions/__init__.py +++ b/llm-service/app/routers/index/sessions/__init__.py @@ -108,11 +108,16 @@ def chat_history( ) @exceptions.propagates def get_message_by_id(session_id: int, message_id: str) -> RagStudioChatMessage: - results: list[RagStudioChatMessage] = chat_history_manager.retrieve_chat_history(session_id=session_id) + results: list[RagStudioChatMessage] = chat_history_manager.retrieve_chat_history( + session_id=session_id + ) for message in results: if message.id == message_id: return message - raise HTTPException(status_code=404, detail=f"Message with id {message_id} not found in session {session_id}") + raise HTTPException( + status_code=404, + detail=f"Message with id {message_id} not found in session {session_id}", + ) @router.delete( @@ -222,20 +227,15 @@ def stream_chat_completion( configuration = request.configuration or RagPredictConfiguration() def generate_stream(): + response_id: str = "" for response in v3_chat( session, request.query, configuration, user_name=origin_remote_user ): - yield json.dumps( - {"text": response.message.content, "response_id": response.additional_kwargs["response_id"], "done": response.delta is None} - ) + "\n" - - def full_response(): - yield json.dumps({"text": "Done", "done": True}) + "\n" - - def combined_gen(): - yield from generate_stream() - yield from full_response() + print(response.delta) + response_id = 
response.additional_kwargs["response_id"] + yield f"data: {response.delta}" + "\n\n" + yield f"data: {response_id}" + "\n\n" # kick off evals with full response # todo: write to history, start evals, rewrite question, log to mlfow once the response is done - return StreamingResponse(combined_gen(), media_type="application/x-ndjson") + return StreamingResponse(generate_stream(), media_type="text/event-stream") diff --git a/ui/package.json b/ui/package.json index 30038622..16dc2366 100644 --- a/ui/package.json +++ b/ui/package.json @@ -21,6 +21,7 @@ "@ant-design/icons": "^5.5.1", "@emotion/react": "^11.14.0", "@emotion/styled": "^11.14.0", + "@microsoft/fetch-event-source": "^2.0.1", "@mui/material": "^6.4.3", "@mui/x-charts": "^7.26.0", "@tanstack/react-query": "^5.59.20", @@ -29,6 +30,7 @@ "antd": "^5.24.6", "date-fns": "^4.1.0", "lodash": "^4.17.21", + "ndjson": "^2.0.0", "react": "^18.3.1", "react-dom": "^18.3.1", "react-intersection-observer": "^9.16.0", @@ -48,6 +50,7 @@ "@testing-library/user-event": "^14.5.2", "@types/eslint__js": "^8.42.3", "@types/lodash": "^4.17.13", + "@types/ndjson": "^2.0.4", "@types/node": "^22.9.0", "@types/react": "^18.3.12", "@types/react-dom": "^18.3.1", diff --git a/ui/pnpm-lock.yaml b/ui/pnpm-lock.yaml index 0a9c6b26..a29c8bc6 100644 --- a/ui/pnpm-lock.yaml +++ b/ui/pnpm-lock.yaml @@ -17,6 +17,9 @@ importers: '@emotion/styled': specifier: ^11.14.0 version: 11.14.0(@emotion/react@11.14.0(@types/react@18.3.12)(react@18.3.1))(@types/react@18.3.12)(react@18.3.1) + '@microsoft/fetch-event-source': + specifier: ^2.0.1 + version: 2.0.1 '@mui/material': specifier: ^6.4.3 version: 6.4.3(@emotion/react@11.14.0(@types/react@18.3.12)(react@18.3.1))(@emotion/styled@11.14.0(@emotion/react@11.14.0(@types/react@18.3.12)(react@18.3.1))(@types/react@18.3.12)(react@18.3.1))(@types/react@18.3.12)(react-dom@18.3.1(react@18.3.1))(react@18.3.1) @@ -41,6 +44,9 @@ importers: lodash: specifier: ^4.17.21 version: 4.17.21 + ndjson: + specifier: ^2.0.0 + version: 2.0.0 react: specifier: ^18.3.1 version: 18.3.1 @@ -93,6 +99,9 @@ importers: '@types/lodash': specifier: ^4.17.13 version: 4.17.13 + '@types/ndjson': + specifier: ^2.0.4 + version: 2.0.4 '@types/node': specifier: ^22.9.0 version: 22.9.0 @@ -768,6 +777,9 @@ packages: '@jridgewell/trace-mapping@0.3.25': resolution: {integrity: sha512-vNk6aEwybGtawWmy/PzwnGDOjCkLWSD2wqvjGGAgOAwCGWySYXfYoxt00IJkTF+8Lb57DwOb3Aa0o9CApepiYQ==} + '@microsoft/fetch-event-source@2.0.1': + resolution: {integrity: sha512-W6CLUJ2eBMw3Rec70qrsEW0jOm/3twwJv21mrmj2yORiaVmVYGS4sSS5yUwvQc1ZlDLYGPnClVWmUUMagKNsfA==} + '@mui/core-downloads-tracker@6.4.3': resolution: {integrity: sha512-hlyOzo2ObarllAOeT1ZSAusADE5NZNencUeIvXrdQ1Na+FL1lcznhbxfV5He1KqGiuR8Az3xtCUcYKwMVGFdzg==} @@ -1439,6 +1451,9 @@ packages: '@types/ms@0.7.34': resolution: {integrity: sha512-nG96G3Wp6acyAgJqGasjODb+acrI7KltPiRxzHPXnP3NgI28bpQDRv53olbqGXbfcgF5aiiHmO3xpwEpS5Ld9g==} + '@types/ndjson@2.0.4': + resolution: {integrity: sha512-ajAl7AjhFstF6waORYNSS49GL5iBKisqJlgvXuprXFKCX9fto4ordlNU3+XMgkMddgeR0WoQQBmKUk0v0dJ4pw==} + '@types/node@22.9.0': resolution: {integrity: sha512-vuyHg81vvWA1Z1ELfvLko2c8f34gyA0zaic0+Rllc5lbCnbSyuvb2Oxpm6TAUAC/2xZN3QGqxBNggD1nNR2AfQ==} @@ -1468,6 +1483,9 @@ packages: '@types/swagger-ui-react@5.18.0': resolution: {integrity: sha512-c2M9adVG7t28t1pq19K9Jt20VLQf0P/fwJwnfcmsVVsdkwCWhRmbKDu+tIs0/NGwJ/7GY8lBx+iKZxuDI5gDbw==} + '@types/through@0.0.33': + resolution: {integrity: 
sha512-HsJ+z3QuETzP3cswwtzt2vEIiHBk/dCcHGhbmG5X3ecnwFD/lPrMpliGXxSCg03L9AhrdwA4Oz/qfspkDW+xGQ==} + '@types/trusted-types@2.0.7': resolution: {integrity: sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw==} @@ -2663,6 +2681,9 @@ packages: json-stable-stringify-without-jsonify@1.0.1: resolution: {integrity: sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw==} + json-stringify-safe@5.0.1: + resolution: {integrity: sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==} + json2mq@0.2.0: resolution: {integrity: sha512-SzoRg7ux5DWTII9J2qkrZrqV1gt+rTaoufMxEzXbS26Uid0NwaJd123HcoB80TgubEppxxIGdNxCx50fEoEWQA==} @@ -2919,6 +2940,9 @@ packages: resolution: {integrity: sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==} engines: {node: '>=16 || 14 >=14.17'} + minimist@1.2.8: + resolution: {integrity: sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==} + ms@2.1.3: resolution: {integrity: sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==} @@ -2930,6 +2954,11 @@ packages: natural-compare@1.4.0: resolution: {integrity: sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==} + ndjson@2.0.0: + resolution: {integrity: sha512-nGl7LRGrzugTtaFcJMhLbpzJM6XdivmbkdlaGcrk/LXg2KL/YBC6z1g70xh0/al+oFuVFP8N8kiWRucmeEH/qQ==} + engines: {node: '>=10'} + hasBin: true + neotraverse@0.6.18: resolution: {integrity: sha512-Z4SmBUweYa09+o6pG+eASabEpP6QkQ70yHj351pQoEXIs8uHbaU2DWVmzBANKgflPa47A50PtB2+NgRpQvr7vA==} engines: {node: '>= 10'} @@ -3495,6 +3524,10 @@ packages: resolution: {integrity: sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==} engines: {node: '>=0.10.0'} + readable-stream@3.6.2: + resolution: {integrity: sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==} + engines: {node: '>= 6'} + readdirp@3.6.0: resolution: {integrity: sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==} engines: {node: '>=8.10.0'} @@ -3697,6 +3730,9 @@ packages: space-separated-tokens@2.0.2: resolution: {integrity: sha512-PEGlAwrG8yXGXRjW32fGbg66JAlOAwbObuqVoJpv/mRgoWDQfgH1wDPvtzWyUSNAXBGSk8h755YDbbcEy3SH2Q==} + split2@3.2.2: + resolution: {integrity: sha512-9NThjpgZnifTkJpzTZ7Eue85S49QwpNhZTq6GRJwObb6jnLFNGB7Qm73V5HewTROPyxD0C29xqmaI68bQtV+hg==} + sprintf-js@1.0.3: resolution: {integrity: sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g==} @@ -3735,6 +3771,9 @@ packages: resolution: {integrity: sha512-UXSH262CSZY1tfu3G3Secr6uGLCFVPMhIqHjlgCUtCCcgihYc/xKs9djMTMUOb2j1mVSeU8EU6NWc/iQKU6Gfg==} engines: {node: '>= 0.4'} + string_decoder@1.3.0: + resolution: {integrity: sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==} + stringify-entities@4.0.4: resolution: {integrity: sha512-IwfBptatlO+QCJUo19AqvrPNqlVMpW9YEL2LIVY+Rpv2qsjCGxaDLNRgeGsQWJhfItebuJhsGSLjaBbNSQ+ieg==} @@ -3797,6 +3836,9 @@ packages: resolution: {integrity: sha512-B71/4oyj61iNH0KeCamLuE2rmKuTO5byTOSVwECM5FA7TiAiAW+UqTKZ9ERueC4qvgSttUhdmq1mXC3kJqGX7A==} engines: {node: '>=12.22'} + through2@4.0.2: + resolution: {integrity: sha512-iOqSav00cVxEEICeD7TjLB1sueEL+81Wpzp2bY17uZjZN0pWZPuo4suZ/61VujxmqSGFfgOcNuTZ85QJwNZQpw==} + tiny-invariant@1.3.3: resolution: 
{integrity: sha512-+FbBPE1o9QAYvviau/qC5SE3caw21q3xkvWKBtja5vgqOWIHHJ3ioaq1VPfn/Szqctz2bU/oYeKd9/z5BL+PVg==} @@ -3989,6 +4031,9 @@ packages: peerDependencies: react: ^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 + util-deprecate@1.0.2: + resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==} + uuid@11.0.5: resolution: {integrity: sha512-508e6IcKLrhxKdBbcA2b4KQZlLVp2+J5UwQ6F7Drckkc5N9ZJwFa4TgWtsww9UG8fGHbm6gbV19TdM5pQ4GaIA==} hasBin: true @@ -4719,6 +4764,8 @@ snapshots: '@jridgewell/resolve-uri': 3.1.2 '@jridgewell/sourcemap-codec': 1.5.0 + '@microsoft/fetch-event-source@2.0.1': {} + '@mui/core-downloads-tracker@6.4.3': {} '@mui/material@6.4.3(@emotion/react@11.14.0(@types/react@18.3.12)(react@18.3.1))(@emotion/styled@11.14.0(@emotion/react@11.14.0(@types/react@18.3.12)(react@18.3.1))(@types/react@18.3.12)(react@18.3.1))(@types/react@18.3.12)(react-dom@18.3.1(react@18.3.1))(react@18.3.1)': @@ -5676,6 +5723,11 @@ snapshots: '@types/ms@0.7.34': {} + '@types/ndjson@2.0.4': + dependencies: + '@types/node': 22.9.0 + '@types/through': 0.0.33 + '@types/node@22.9.0': dependencies: undici-types: 6.19.8 @@ -5707,6 +5759,10 @@ snapshots: dependencies: '@types/react': 18.3.12 + '@types/through@0.0.33': + dependencies: + '@types/node': 22.9.0 + '@types/trusted-types@2.0.7': optional: true @@ -7114,6 +7170,8 @@ snapshots: json-stable-stringify-without-jsonify@1.0.1: {} + json-stringify-safe@5.0.1: {} + json2mq@0.2.0: dependencies: string-convert: 0.2.1 @@ -7592,12 +7650,22 @@ snapshots: dependencies: brace-expansion: 2.0.1 + minimist@1.2.8: {} + ms@2.1.3: {} nanoid@3.3.11: {} natural-compare@1.4.0: {} + ndjson@2.0.0: + dependencies: + json-stringify-safe: 5.0.1 + minimist: 1.2.8 + readable-stream: 3.6.2 + split2: 3.2.2 + through2: 4.0.2 + neotraverse@0.6.18: {} no-case@3.0.4: @@ -8247,6 +8315,12 @@ snapshots: dependencies: loose-envify: 1.4.0 + readable-stream@3.6.2: + dependencies: + inherits: 2.0.4 + string_decoder: 1.3.0 + util-deprecate: 1.0.2 + readdirp@3.6.0: dependencies: picomatch: 2.3.1 @@ -8495,6 +8569,10 @@ snapshots: space-separated-tokens@2.0.2: {} + split2@3.2.2: + dependencies: + readable-stream: 3.6.2 + sprintf-js@1.0.3: {} stackback@0.0.2: {} @@ -8550,6 +8628,10 @@ snapshots: define-properties: 1.2.1 es-object-atoms: 1.0.0 + string_decoder@1.3.0: + dependencies: + safe-buffer: 5.2.1 + stringify-entities@4.0.4: dependencies: character-entities-html4: 2.1.0 @@ -8658,6 +8740,10 @@ snapshots: throttle-debounce@5.0.2: {} + through2@4.0.2: + dependencies: + readable-stream: 3.6.2 + tiny-invariant@1.3.3: {} tiny-warning@1.0.3: {} @@ -8858,6 +8944,8 @@ snapshots: dependencies: react: 18.3.1 + util-deprecate@1.0.2: {} + uuid@11.0.5: {} vfile-message@4.0.2: diff --git a/ui/src/api/chatApi.ts b/ui/src/api/chatApi.ts index 83efc4c9..4e3c1c78 100644 --- a/ui/src/api/chatApi.ts +++ b/ui/src/api/chatApi.ts @@ -52,6 +52,11 @@ import { useQueryClient, } from "@tanstack/react-query"; import { suggestedQuestionKey } from "src/api/ragQueryApi.ts"; +import { + EventStreamContentType, + fetchEventSource, +} from "@microsoft/fetch-event-source"; +import { Dispatch, SetStateAction } from "react"; export interface SourceNode { node_id: string; @@ -389,6 +394,12 @@ interface ChatMutationOptions { onChunk?: (chunk: string) => void; } +export interface ChatMutationResponse { + text: string; + response_id: string; + done: boolean; +} + export function useStreamChatMutation(options?: ChatMutationOptions) { return useMutation({ mutationKey: 
[MutationKeys.streamChatMutation], @@ -417,15 +428,128 @@ export function useStreamChatMutation(options?: ChatMutationOptions) { const chunk = decoder.decode(value ?? new Uint8Array(), { stream: true, }); + const parsedChunk = JSON.parse(chunk) as ChatMutationResponse; // if (doneReading) { // console.log("HELLO"); // } else { // console.log("CHUNK", chunk); // } - fullResponse += chunk; - options?.onChunk?.(chunk); + console.log(parsedChunk); + fullResponse += parsedChunk.text; + options?.onChunk?.(parsedChunk.text); } return fullResponse; }, }); } + +export const useChatMutationV2 = ({ + onSuccess, + onError, + onChunk, +}: UseMutationType & { onChunk: (msg: string) => void }) => { + const queryClient = useQueryClient(); + return useMutation({ + mutationKey: [MutationKeys.chatMutation], + mutationFn: (request: ChatMutationRequest) => + chatMutationV2(request, onChunk), + onMutate: (variables) => { + queryClient.setQueryData>( + chatHistoryQueryKey({ + session_id: variables.session_id, + }), + (cachedData) => + appendPlaceholderToChatHistory(variables.query, cachedData), + ); + }, + onSuccess: (data, variables) => { + // queryClient.setQueryData>( + // chatHistoryQueryKey({ + // session_id: variables.request.session_id, + // }), + // (cachedData) => replacePlaceholderInChatHistory(data, cachedData), + // ); + queryClient + .invalidateQueries({ + queryKey: suggestedQuestionKey(variables.session_id), + }) + .catch((error: unknown) => { + console.error(error); + }); + }, + onError: (error: Error, variables) => { + const uuid = crypto.randomUUID(); + const errorMessage: ChatMessageType = { + id: `error-${uuid}`, + session_id: variables.session_id, + source_nodes: [], + rag_message: { + user: variables.query, + assistant: error.message, + }, + evaluations: [], + timestamp: Date.now(), + }; + queryClient.setQueryData>( + chatHistoryQueryKey({ + session_id: variables.session_id, + offset: 0, + }), + (cachedData) => + replacePlaceholderInChatHistory(errorMessage, cachedData), + ); + + onError?.(error); + }, + }); +}; + +const chatMutationV2 = async ( + request: ChatMutationRequest, + onChunk: (chunk: string) => void, +): Promise => { + const ctrl = new AbortController(); + await fetchEventSource( + `${llmServicePath}/sessions/${request.session_id.toString()}/stream-completion`, + { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + query: request.query, + configuration: request.configuration, + }), + signal: ctrl.signal, + onmessage(msg) { + onChunk(msg.data); + }, + onclose() { + console.log("Connection closed"); + }, + onerror(err) { + console.error("Error", err); + ctrl.abort(); + }, + async onopen(response) { + if ( + response.ok && + response.headers.get("content-type")?.includes(EventStreamContentType) + ) { + await Promise.resolve(); + console.log("all good"); + return; // everything's good + } else if ( + response.status >= 400 && + response.status < 500 && + response.status !== 429 + ) { + // client-side errors are usually non-retriable: + throw new Error(); + } else { + throw new Error(); + } + }, + }, + ); +}; diff --git a/ui/src/pages/RagChatTab/ChatLayout.tsx b/ui/src/pages/RagChatTab/ChatLayout.tsx index 0042f44b..ffb5247e 100644 --- a/ui/src/pages/RagChatTab/ChatLayout.tsx +++ b/ui/src/pages/RagChatTab/ChatLayout.tsx @@ -73,6 +73,7 @@ function ChatLayout() { const { data: dataSources, status: dataSourcesStatus } = useGetDataSourcesForProject(+projectId); const [excludeKnowledgeBase, setExcludeKnowledgeBase] = useState(false); + 
const [streamedChat, setStreamedChat] = useState(""); const { status: chatHistoryStatus, data: chatHistory, @@ -108,6 +109,7 @@ function ChatLayout() { isFetching, isFetchingPreviousPage, }, + streamedChatState: [streamedChat, setStreamedChat], dataSourceSize, dataSourcesQuery: { dataSources: dataSources ?? [], diff --git a/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessage.tsx b/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessage.tsx index bf4fab20..988b8b52 100644 --- a/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessage.tsx +++ b/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessage.tsx @@ -50,6 +50,8 @@ import Markdown from "react-markdown"; import "../tableMarkdown.css"; import { ExclamationCircleTwoTone } from "@ant-design/icons"; +import { useContext } from "react"; +import { RagChatContext } from "pages/RagChatTab/State/RagChatContext.tsx"; const isError = (data: ChatMessageType) => { return data.id.startsWith("error-"); diff --git a/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx b/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx index b39f23f1..975dee79 100644 --- a/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx +++ b/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx @@ -36,19 +36,37 @@ * DATA. ******************************************************************************/ -import { Divider, Row, Skeleton } from "antd"; +import { Divider, Row, Skeleton, Typography } from "antd"; import UserQuestion from "pages/RagChatTab/ChatOutput/ChatMessages/UserQuestion.tsx"; +import { useContext } from "react"; +import { RagChatContext } from "pages/RagChatTab/State/RagChatContext.tsx"; +import Markdown from "react-markdown"; +import Remark from "remark-gfm"; const PendingRagOutputSkeleton = ({ question }: { question: string }) => { + const { + streamedChatState: [streamedChat], + } = useContext(RagChatContext); + console.log(streamedChat); return (
- - - - - + {streamedChat ? ( + + + {streamedChat} + + + ) : ( + + + + )}
diff --git a/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx b/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx index fe14d1a1..65264e54 100644 --- a/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx +++ b/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx @@ -39,10 +39,13 @@ import { Button, Flex, Input, InputRef, Switch, Tooltip } from "antd"; import { DatabaseFilled, SendOutlined } from "@ant-design/icons"; import { useContext, useEffect, useRef, useState } from "react"; +import ndjson from "ndjson"; import { RagChatContext } from "pages/RagChatTab/State/RagChatContext.tsx"; import { + ChatMutationResponse, createQueryConfiguration, useChatMutation, + useChatMutationV2, useStreamChatMutation, } from "src/api/chatApi.ts"; import { useParams, useSearch } from "@tanstack/react-router"; @@ -51,6 +54,11 @@ import { cdlBlue600 } from "src/cuix/variables.ts"; import type { SwitchChangeEventHandler } from "antd/lib/switch"; import { useSuggestQuestions } from "src/api/ragQueryApi.ts"; import SuggestedQuestionsFooter from "pages/RagChatTab/FooterComponents/SuggestedQuestionsFooter.tsx"; +import { llmServicePath } from "src/api/utils.ts"; +import { + EventStreamContentType, + fetchEventSource, +} from "@microsoft/fetch-event-source"; const RagChatQueryInput = ({ newSessionCallback, @@ -62,16 +70,15 @@ const RagChatQueryInput = ({ chatHistoryQuery: { flatChatHistory }, dataSourceSize, dataSourcesQuery: { dataSourcesStatus }, + streamedChatState: [, setStreamedChat], } = useContext(RagChatContext); const [userInput, setUserInput] = useState(""); const { sessionId } = useParams({ strict: false }); - const [response, setResponse] = useState(""); const search: { question?: string } = useSearch({ strict: false, }); const inputRef = useRef(null); - // console.log(response); const { data: sampleQuestions, isPending: sampleQuestionsIsPending, @@ -90,10 +97,10 @@ const RagChatQueryInput = ({ }, }); - const streamChatMutation = useStreamChatMutation({ + const streamChatMutation = useChatMutationV2({ onChunk: (chunk) => { - console.log("stream chunk", chunk); - setResponse(() => chunk); + console.log("chunk", chunk); + setStreamedChat((prev) => prev + chunk); }, }); @@ -103,7 +110,7 @@ const RagChatQueryInput = ({ } }, [inputRef.current, flatChatHistory.length]); - const handleChat = (userInput: string) => { + const handleChat = async (userInput: string) => { if (userInput.trim().length <= 0) { return; } @@ -111,14 +118,9 @@ const RagChatQueryInput = ({ if (sessionId) { streamChatMutation.mutate({ query: userInput, - configuration: createQueryConfiguration(excludeKnowledgeBase), session_id: +sessionId, + configuration: createQueryConfiguration(excludeKnowledgeBase), }); - // chatMutation.mutate({ - // query: userInput, - // session_id: +sessionId, - // configuration: createQueryConfiguration(excludeKnowledgeBase), - // }); } else { newSessionCallback(userInput); } diff --git a/ui/src/pages/RagChatTab/State/RagChatContext.tsx b/ui/src/pages/RagChatTab/State/RagChatContext.tsx index c61ce7a8..9df858aa 100644 --- a/ui/src/pages/RagChatTab/State/RagChatContext.tsx +++ b/ui/src/pages/RagChatTab/State/RagChatContext.tsx @@ -59,6 +59,7 @@ export interface RagChatContextType { InfiniteQueryObserverResult> >; }; + streamedChatState: [string, Dispatch>]; dataSourcesQuery: { dataSources: DataSourceType[]; dataSourcesStatus?: "error" | "success" | "pending"; @@ -79,6 +80,7 @@ export const RagChatContext = createContext({ {} as InfiniteQueryObserverResult>, ), }, + 
streamedChatState: ["", () => null], dataSourcesQuery: { dataSources: [], dataSourcesStatus: undefined }, dataSourceSize: null, excludeKnowledgeBaseState: [false, () => null], From df0203af500693e001f5723e0d369b9c68716e57 Mon Sep 17 00:00:00 2001 From: Michael Liu Date: Tue, 6 May 2025 10:36:45 -0700 Subject: [PATCH 13/41] Consolidate response_id generation --- llm-service/app/services/chat.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/llm-service/app/services/chat.py b/llm-service/app/services/chat.py index 68181653..2d5c7b62 100644 --- a/llm-service/app/services/chat.py +++ b/llm-service/app/services/chat.py @@ -83,8 +83,10 @@ def v3_chat( use_summary_filter=session.query_configuration.enable_summary_filter, ) + response_id = str(uuid.uuid4()) + if configuration.exclude_knowledge_base or len(session.data_source_ids) == 0: - return stream_direct_llm_chat(session, query, user_name=user_name) + return stream_direct_llm_chat(session, response_id, query, user_name) total_data_sources_size: int = sum( map( @@ -93,9 +95,7 @@ def v3_chat( ) ) if total_data_sources_size == 0: - return stream_direct_llm_chat(session, query, user_name) - - response_id = str(uuid.uuid4()) + return stream_direct_llm_chat(session, response_id, query, user_name) new_chat_message: RagStudioChatMessage = _run_chat( session, response_id, query, query_configuration, user_name @@ -121,8 +121,10 @@ def v2_chat( use_summary_filter=session.query_configuration.enable_summary_filter, ) + response_id = str(uuid.uuid4()) + if configuration.exclude_knowledge_base or len(session.data_source_ids) == 0: - return direct_llm_chat(session, query, user_name=user_name) + return direct_llm_chat(session, response_id, query, user_name) total_data_sources_size: int = sum( map( @@ -131,9 +133,7 @@ def v2_chat( ) ) if total_data_sources_size == 0: - return direct_llm_chat(session, query, user_name) - - response_id = str(uuid.uuid4()) + return direct_llm_chat(session, response_id, query, user_name) new_chat_message: RagStudioChatMessage = _run_chat( session, response_id, query, query_configuration, user_name @@ -352,9 +352,8 @@ def process_response(response: str | None) -> list[str]: def direct_llm_chat( - session: Session, query: str, user_name: Optional[str] + session: Session, response_id: str, query: str, user_name: Optional[str] ) -> RagStudioChatMessage: - response_id = str(uuid.uuid4()) record_direct_llm_mlflow_run(response_id, session, user_name) chat_response = llm_completion.completion( @@ -378,9 +377,8 @@ def direct_llm_chat( def stream_direct_llm_chat( - session: Session, query: str, user_name: Optional[str] + session: Session, response_id: str, query: str, user_name: Optional[str] ) -> Generator[ChatResponse, None, None]: - response_id = str(uuid.uuid4()) record_direct_llm_mlflow_run(response_id, session, user_name) chat_response = llm_completion.stream_completion( From cb81801db28ee3bb855d6f3292946f26bc90ab5f Mon Sep 17 00:00:00 2001 From: jwatson Date: Tue, 6 May 2025 11:16:46 -0700 Subject: [PATCH 14/41] wip lastFile:ui/src/api/chatApi.ts --- .../app/routers/index/sessions/__init__.py | 6 ++--- ui/src/api/chatApi.ts | 27 +++++++++++++++---- .../Loaders/PendingRagOutputSkeleton.tsx | 2 +- .../FooterComponents/RagChatQueryInput.tsx | 10 +------ 4 files changed, 27 insertions(+), 18 deletions(-) diff --git a/llm-service/app/routers/index/sessions/__init__.py b/llm-service/app/routers/index/sessions/__init__.py index 485270b9..42426c24 100644 --- 
a/llm-service/app/routers/index/sessions/__init__.py +++ b/llm-service/app/routers/index/sessions/__init__.py @@ -231,10 +231,10 @@ def generate_stream(): for response in v3_chat( session, request.query, configuration, user_name=origin_remote_user ): - print(response.delta) response_id = response.additional_kwargs["response_id"] - yield f"data: {response.delta}" + "\n\n" - yield f"data: {response_id}" + "\n\n" + json_delta = json.dumps({ "text": response.delta }) + yield f"data: {json_delta}" + "\n\n" + yield f'data: {"response_id" : {response_id}}\n\n' # kick off evals with full response # todo: write to history, start evals, rewrite question, log to mlfow once the response is done diff --git a/ui/src/api/chatApi.ts b/ui/src/api/chatApi.ts index 4e3c1c78..e60aa4fd 100644 --- a/ui/src/api/chatApi.ts +++ b/ui/src/api/chatApi.ts @@ -53,10 +53,10 @@ import { } from "@tanstack/react-query"; import { suggestedQuestionKey } from "src/api/ragQueryApi.ts"; import { + EventSourceMessage, EventStreamContentType, fetchEventSource, } from "@microsoft/fetch-event-source"; -import { Dispatch, SetStateAction } from "react"; export interface SourceNode { node_id: string; @@ -434,7 +434,7 @@ export function useStreamChatMutation(options?: ChatMutationOptions) { // } else { // console.log("CHUNK", chunk); // } - console.log(parsedChunk); + // console.log(parsedChunk); fullResponse += parsedChunk.text; options?.onChunk?.(parsedChunk.text); } @@ -462,7 +462,11 @@ export const useChatMutationV2 = ({ appendPlaceholderToChatHistory(variables.query, cachedData), ); }, + onSettled: (data, error, variables) => { + console.log(`onSettled is here! with response id: ${data}`); + }, onSuccess: (data, variables) => { + console.log(`onSuccess is here! with response id: ${data}`); // queryClient.setQueryData>( // chatHistoryQueryKey({ // session_id: variables.request.session_id, @@ -507,8 +511,9 @@ export const useChatMutationV2 = ({ const chatMutationV2 = async ( request: ChatMutationRequest, onChunk: (chunk: string) => void, -): Promise => { +): Promise => { const ctrl = new AbortController(); + let responseId = ""; await fetchEventSource( `${llmServicePath}/sessions/${request.session_id.toString()}/stream-completion`, { @@ -521,8 +526,19 @@ const chatMutationV2 = async ( configuration: request.configuration, }), signal: ctrl.signal, - onmessage(msg) { - onChunk(msg.data); + onmessage(msg: EventSourceMessage) { + const data = JSON.parse(msg.data) as { + text?: string; + response_id?: string; + }; + + console.log(`data: "${msg.data}"`); + if (data.text) { + onChunk(data.text); + } + if (data.response_id) { + responseId = data.response_id; + } }, onclose() { console.log("Connection closed"); @@ -552,4 +568,5 @@ const chatMutationV2 = async ( }, }, ); + return responseId; }; diff --git a/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx b/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx index 975dee79..9d824104 100644 --- a/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx +++ b/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx @@ -47,7 +47,7 @@ const PendingRagOutputSkeleton = ({ question }: { question: string }) => { const { streamedChatState: [streamedChat], } = useContext(RagChatContext); - console.log(streamedChat); + // console.log(streamedChat); return (
diff --git a/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx b/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx index 65264e54..84719803 100644 --- a/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx +++ b/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx @@ -39,14 +39,11 @@ import { Button, Flex, Input, InputRef, Switch, Tooltip } from "antd"; import { DatabaseFilled, SendOutlined } from "@ant-design/icons"; import { useContext, useEffect, useRef, useState } from "react"; -import ndjson from "ndjson"; import { RagChatContext } from "pages/RagChatTab/State/RagChatContext.tsx"; import { - ChatMutationResponse, createQueryConfiguration, useChatMutation, useChatMutationV2, - useStreamChatMutation, } from "src/api/chatApi.ts"; import { useParams, useSearch } from "@tanstack/react-router"; import { cdlBlue600 } from "src/cuix/variables.ts"; @@ -54,11 +51,6 @@ import { cdlBlue600 } from "src/cuix/variables.ts"; import type { SwitchChangeEventHandler } from "antd/lib/switch"; import { useSuggestQuestions } from "src/api/ragQueryApi.ts"; import SuggestedQuestionsFooter from "pages/RagChatTab/FooterComponents/SuggestedQuestionsFooter.tsx"; -import { llmServicePath } from "src/api/utils.ts"; -import { - EventStreamContentType, - fetchEventSource, -} from "@microsoft/fetch-event-source"; const RagChatQueryInput = ({ newSessionCallback, @@ -99,7 +91,7 @@ const RagChatQueryInput = ({ const streamChatMutation = useChatMutationV2({ onChunk: (chunk) => { - console.log("chunk", chunk); + // console.log("chunk", chunk); setStreamedChat((prev) => prev + chunk); }, }); From 469d14b4180d71492e0b981cf03273fac2c5ba45 Mon Sep 17 00:00:00 2001 From: Michael Liu Date: Tue, 6 May 2025 11:28:11 -0700 Subject: [PATCH 15/41] drop databases lastFile:ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx --- ui/src/api/chatApi.ts | 5 ---- .../ChatOutput/ChatMessages/ChatMessage.tsx | 2 -- .../Loaders/PendingRagOutputSkeleton.tsx | 27 ++++++++++++------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/ui/src/api/chatApi.ts b/ui/src/api/chatApi.ts index e60aa4fd..6bc2e1b6 100644 --- a/ui/src/api/chatApi.ts +++ b/ui/src/api/chatApi.ts @@ -444,7 +444,6 @@ export function useStreamChatMutation(options?: ChatMutationOptions) { } export const useChatMutationV2 = ({ - onSuccess, onError, onChunk, }: UseMutationType & { onChunk: (msg: string) => void }) => { @@ -462,9 +461,6 @@ export const useChatMutationV2 = ({ appendPlaceholderToChatHistory(variables.query, cachedData), ); }, - onSettled: (data, error, variables) => { - console.log(`onSettled is here! with response id: ${data}`); - }, onSuccess: (data, variables) => { console.log(`onSuccess is here! 
with response id: ${data}`); // queryClient.setQueryData>( @@ -532,7 +528,6 @@ const chatMutationV2 = async ( response_id?: string; }; - console.log(`data: "${msg.data}"`); if (data.text) { onChunk(data.text); } diff --git a/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessage.tsx b/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessage.tsx index 988b8b52..bf4fab20 100644 --- a/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessage.tsx +++ b/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessage.tsx @@ -50,8 +50,6 @@ import Markdown from "react-markdown"; import "../tableMarkdown.css"; import { ExclamationCircleTwoTone } from "@ant-design/icons"; -import { useContext } from "react"; -import { RagChatContext } from "pages/RagChatTab/State/RagChatContext.tsx"; const isError = (data: ChatMessageType) => { return data.id.startsWith("error-"); diff --git a/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx b/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx index 9d824104..8d7795eb 100644 --- a/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx +++ b/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx @@ -36,7 +36,7 @@ * DATA. ******************************************************************************/ -import { Divider, Row, Skeleton, Typography } from "antd"; +import { Divider, Flex, Row, Skeleton, Typography } from "antd"; import UserQuestion from "pages/RagChatTab/ChatOutput/ChatMessages/UserQuestion.tsx"; import { useContext } from "react"; import { RagChatContext } from "pages/RagChatTab/State/RagChatContext.tsx"; @@ -53,15 +53,22 @@ const PendingRagOutputSkeleton = ({ question }: { question: string }) => {
{streamedChat ? ( - - - {streamedChat} - - + + + + {streamedChat} + + + ) : ( From 9c5a00838de6d6a9cb37934952ad1813334a469b Mon Sep 17 00:00:00 2001 From: Baasit Sharief Date: Tue, 6 May 2025 11:42:10 -0700 Subject: [PATCH 16/41] mob next [ci-skip] [ci skip] [skip ci] lastFile:ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx --- llm-service/app/routers/index/sessions/__init__.py | 2 +- .../Loaders/PendingRagOutputSkeleton.tsx | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/llm-service/app/routers/index/sessions/__init__.py b/llm-service/app/routers/index/sessions/__init__.py index 42426c24..cf414ff3 100644 --- a/llm-service/app/routers/index/sessions/__init__.py +++ b/llm-service/app/routers/index/sessions/__init__.py @@ -234,7 +234,7 @@ def generate_stream(): response_id = response.additional_kwargs["response_id"] json_delta = json.dumps({ "text": response.delta }) yield f"data: {json_delta}" + "\n\n" - yield f'data: {"response_id" : {response_id}}\n\n' + yield f'data: {{"response_id" : {response_id}}}\n\n' # kick off evals with full response # todo: write to history, start evals, rewrite question, log to mlfow once the response is done diff --git a/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx b/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx index 8d7795eb..d5d8a575 100644 --- a/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx +++ b/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx @@ -42,6 +42,8 @@ import { useContext } from "react"; import { RagChatContext } from "pages/RagChatTab/State/RagChatContext.tsx"; import Markdown from "react-markdown"; import Remark from "remark-gfm"; +import Images from "src/components/images/Images.ts"; +import { cdlBlue500 } from "src/cuix/variables.ts"; const PendingRagOutputSkeleton = ({ question }: { question: string }) => { const { @@ -59,6 +61,18 @@ const PendingRagOutputSkeleton = ({ question }: { question: string }) => { justify="space-between" gap={8} > +
+ +
Date: Tue, 6 May 2025 11:44:47 -0700 Subject: [PATCH 17/41] mob next [ci-skip] [ci skip] [skip ci] lastFile:llm-service/app/routers/index/sessions/__init__.py --- llm-service/app/routers/index/sessions/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm-service/app/routers/index/sessions/__init__.py b/llm-service/app/routers/index/sessions/__init__.py index cf414ff3..10be6383 100644 --- a/llm-service/app/routers/index/sessions/__init__.py +++ b/llm-service/app/routers/index/sessions/__init__.py @@ -234,7 +234,7 @@ def generate_stream(): response_id = response.additional_kwargs["response_id"] json_delta = json.dumps({ "text": response.delta }) yield f"data: {json_delta}" + "\n\n" - yield f'data: {{"response_id" : {response_id}}}\n\n' + yield f'data: {{"response_id" : "{response_id}"}}\n\n' # kick off evals with full response # todo: write to history, start evals, rewrite question, log to mlfow once the response is done From 603ae739befe5f34af8958d011281977246eb2e2 Mon Sep 17 00:00:00 2001 From: Elijah Williams Date: Tue, 6 May 2025 12:46:07 -0600 Subject: [PATCH 18/41] small refactor --- ui/src/api/chatApi.ts | 49 +----------- .../ChatOutput/ChatMessages/ChatMessage.tsx | 80 ++++++++++--------- .../Loaders/PendingRagOutputSkeleton.tsx | 72 ++++++----------- 3 files changed, 67 insertions(+), 134 deletions(-) diff --git a/ui/src/api/chatApi.ts b/ui/src/api/chatApi.ts index 6bc2e1b6..675aa998 100644 --- a/ui/src/api/chatApi.ts +++ b/ui/src/api/chatApi.ts @@ -108,7 +108,7 @@ export interface ChatResponseFeedback { rating: boolean; } -const placeholderChatResponseId = "placeholder"; +export const placeholderChatResponseId = "placeholder"; export const isPlaceholder = (chatMessage: ChatMessageType): boolean => { return chatMessage.id === placeholderChatResponseId; @@ -390,59 +390,12 @@ const feedbackMutation = async ({ ); }; -interface ChatMutationOptions { - onChunk?: (chunk: string) => void; -} - export interface ChatMutationResponse { text: string; response_id: string; done: boolean; } -export function useStreamChatMutation(options?: ChatMutationOptions) { - return useMutation({ - mutationKey: [MutationKeys.streamChatMutation], - mutationFn: async ({ - query, - configuration, - session_id, - }: ChatMutationRequest) => { - const res = await fetch( - `${llmServicePath}/sessions/${session_id.toString()}/stream-completion`, - { - method: "POST", - body: JSON.stringify({ query, configuration }), - headers: { "Content-Type": "application/json" }, - }, - ); - if (!res.body) throw new Error("Error getting stream completion"); - const reader = res.body.getReader(); - const decoder = new TextDecoder(); - let fullResponse = ""; - let done = false; - while (!done) { - const { value, done: doneReading } = await reader.read(); - done = doneReading; - // do we need the fallback? - const chunk = decoder.decode(value ?? 
new Uint8Array(), { - stream: true, - }); - const parsedChunk = JSON.parse(chunk) as ChatMutationResponse; - // if (doneReading) { - // console.log("HELLO"); - // } else { - // console.log("CHUNK", chunk); - // } - // console.log(parsedChunk); - fullResponse += parsedChunk.text; - options?.onChunk?.(parsedChunk.text); - } - return fullResponse; - }, - }); -} - export const useChatMutationV2 = ({ onError, onChunk, diff --git a/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessage.tsx b/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessage.tsx index bf4fab20..f4e5fa79 100644 --- a/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessage.tsx +++ b/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessage.tsx @@ -55,44 +55,7 @@ const isError = (data: ChatMessageType) => { return data.id.startsWith("error-"); }; -const ChatMessage = ({ data }: { data: ChatMessageType }) => { - if (isError(data)) { - return ( -
-
- - -
- -
- - - - - -
- -
-
- ); - } - - if (isPlaceholder(data)) { - return ; - } - +export const ChatMessageBody = ({ data }: { data: ChatMessageType }) => { return (
{data.rag_message.user ? ( @@ -153,4 +116,45 @@ const ChatMessage = ({ data }: { data: ChatMessageType }) => { ); }; +const ChatMessage = ({ data }: { data: ChatMessageType }) => { + if (isError(data)) { + return ( +
+
+ + +
+ +
+ + + + + +
+ +
+
+ ); + } + + if (isPlaceholder(data)) { + return ; + } + + return ; +}; + export default ChatMessage; diff --git a/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx b/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx index d5d8a575..e3395d4f 100644 --- a/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx +++ b/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx @@ -36,61 +36,37 @@ * DATA. ******************************************************************************/ -import { Divider, Flex, Row, Skeleton, Typography } from "antd"; -import UserQuestion from "pages/RagChatTab/ChatOutput/ChatMessages/UserQuestion.tsx"; +import { Row, Skeleton } from "antd"; import { useContext } from "react"; import { RagChatContext } from "pages/RagChatTab/State/RagChatContext.tsx"; -import Markdown from "react-markdown"; -import Remark from "remark-gfm"; -import Images from "src/components/images/Images.ts"; -import { cdlBlue500 } from "src/cuix/variables.ts"; +import { ChatMessageBody } from "pages/RagChatTab/ChatOutput/ChatMessages/ChatMessage.tsx"; +import { ChatMessageType, placeholderChatResponseId } from "src/api/chatApi.ts"; const PendingRagOutputSkeleton = ({ question }: { question: string }) => { const { streamedChatState: [streamedChat], } = useContext(RagChatContext); - // console.log(streamedChat); - return ( -
-
- - {streamedChat ? ( - -
- -
- - - {streamedChat} - - -
- ) : ( - - - - )} -
- -
+ + const streamedMessage: ChatMessageType | undefined = streamedChat + ? { + id: placeholderChatResponseId, + session_id: 0, + source_nodes: [], + rag_message: { + user: question, + assistant: streamedChat, + }, + evaluations: [], + timestamp: Date.now(), + } + : undefined; + + return streamedMessage ? ( + + ) : ( + + + ); }; From 8984ddbe6eacef5c43612b6eb3cb577aa73120d4 Mon Sep 17 00:00:00 2001 From: Elijah Williams Date: Tue, 6 May 2025 13:16:22 -0600 Subject: [PATCH 19/41] remove deps --- ui/package.json | 2 -- ui/pnpm-lock.yaml | 80 ----------------------------------------------- 2 files changed, 82 deletions(-) diff --git a/ui/package.json b/ui/package.json index 16dc2366..e6dead99 100644 --- a/ui/package.json +++ b/ui/package.json @@ -30,7 +30,6 @@ "antd": "^5.24.6", "date-fns": "^4.1.0", "lodash": "^4.17.21", - "ndjson": "^2.0.0", "react": "^18.3.1", "react-dom": "^18.3.1", "react-intersection-observer": "^9.16.0", @@ -50,7 +49,6 @@ "@testing-library/user-event": "^14.5.2", "@types/eslint__js": "^8.42.3", "@types/lodash": "^4.17.13", - "@types/ndjson": "^2.0.4", "@types/node": "^22.9.0", "@types/react": "^18.3.12", "@types/react-dom": "^18.3.1", diff --git a/ui/pnpm-lock.yaml b/ui/pnpm-lock.yaml index a29c8bc6..92a15259 100644 --- a/ui/pnpm-lock.yaml +++ b/ui/pnpm-lock.yaml @@ -44,9 +44,6 @@ importers: lodash: specifier: ^4.17.21 version: 4.17.21 - ndjson: - specifier: ^2.0.0 - version: 2.0.0 react: specifier: ^18.3.1 version: 18.3.1 @@ -99,9 +96,6 @@ importers: '@types/lodash': specifier: ^4.17.13 version: 4.17.13 - '@types/ndjson': - specifier: ^2.0.4 - version: 2.0.4 '@types/node': specifier: ^22.9.0 version: 22.9.0 @@ -1451,9 +1445,6 @@ packages: '@types/ms@0.7.34': resolution: {integrity: sha512-nG96G3Wp6acyAgJqGasjODb+acrI7KltPiRxzHPXnP3NgI28bpQDRv53olbqGXbfcgF5aiiHmO3xpwEpS5Ld9g==} - '@types/ndjson@2.0.4': - resolution: {integrity: sha512-ajAl7AjhFstF6waORYNSS49GL5iBKisqJlgvXuprXFKCX9fto4ordlNU3+XMgkMddgeR0WoQQBmKUk0v0dJ4pw==} - '@types/node@22.9.0': resolution: {integrity: sha512-vuyHg81vvWA1Z1ELfvLko2c8f34gyA0zaic0+Rllc5lbCnbSyuvb2Oxpm6TAUAC/2xZN3QGqxBNggD1nNR2AfQ==} @@ -1483,9 +1474,6 @@ packages: '@types/swagger-ui-react@5.18.0': resolution: {integrity: sha512-c2M9adVG7t28t1pq19K9Jt20VLQf0P/fwJwnfcmsVVsdkwCWhRmbKDu+tIs0/NGwJ/7GY8lBx+iKZxuDI5gDbw==} - '@types/through@0.0.33': - resolution: {integrity: sha512-HsJ+z3QuETzP3cswwtzt2vEIiHBk/dCcHGhbmG5X3ecnwFD/lPrMpliGXxSCg03L9AhrdwA4Oz/qfspkDW+xGQ==} - '@types/trusted-types@2.0.7': resolution: {integrity: sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw==} @@ -2681,9 +2669,6 @@ packages: json-stable-stringify-without-jsonify@1.0.1: resolution: {integrity: sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw==} - json-stringify-safe@5.0.1: - resolution: {integrity: sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==} - json2mq@0.2.0: resolution: {integrity: sha512-SzoRg7ux5DWTII9J2qkrZrqV1gt+rTaoufMxEzXbS26Uid0NwaJd123HcoB80TgubEppxxIGdNxCx50fEoEWQA==} @@ -2940,9 +2925,6 @@ packages: resolution: {integrity: sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==} engines: {node: '>=16 || 14 >=14.17'} - minimist@1.2.8: - resolution: {integrity: sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==} - ms@2.1.3: resolution: {integrity: 
sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==} @@ -2954,11 +2936,6 @@ packages: natural-compare@1.4.0: resolution: {integrity: sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==} - ndjson@2.0.0: - resolution: {integrity: sha512-nGl7LRGrzugTtaFcJMhLbpzJM6XdivmbkdlaGcrk/LXg2KL/YBC6z1g70xh0/al+oFuVFP8N8kiWRucmeEH/qQ==} - engines: {node: '>=10'} - hasBin: true - neotraverse@0.6.18: resolution: {integrity: sha512-Z4SmBUweYa09+o6pG+eASabEpP6QkQ70yHj351pQoEXIs8uHbaU2DWVmzBANKgflPa47A50PtB2+NgRpQvr7vA==} engines: {node: '>= 10'} @@ -3524,10 +3501,6 @@ packages: resolution: {integrity: sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==} engines: {node: '>=0.10.0'} - readable-stream@3.6.2: - resolution: {integrity: sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==} - engines: {node: '>= 6'} - readdirp@3.6.0: resolution: {integrity: sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==} engines: {node: '>=8.10.0'} @@ -3730,9 +3703,6 @@ packages: space-separated-tokens@2.0.2: resolution: {integrity: sha512-PEGlAwrG8yXGXRjW32fGbg66JAlOAwbObuqVoJpv/mRgoWDQfgH1wDPvtzWyUSNAXBGSk8h755YDbbcEy3SH2Q==} - split2@3.2.2: - resolution: {integrity: sha512-9NThjpgZnifTkJpzTZ7Eue85S49QwpNhZTq6GRJwObb6jnLFNGB7Qm73V5HewTROPyxD0C29xqmaI68bQtV+hg==} - sprintf-js@1.0.3: resolution: {integrity: sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g==} @@ -3771,9 +3741,6 @@ packages: resolution: {integrity: sha512-UXSH262CSZY1tfu3G3Secr6uGLCFVPMhIqHjlgCUtCCcgihYc/xKs9djMTMUOb2j1mVSeU8EU6NWc/iQKU6Gfg==} engines: {node: '>= 0.4'} - string_decoder@1.3.0: - resolution: {integrity: sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==} - stringify-entities@4.0.4: resolution: {integrity: sha512-IwfBptatlO+QCJUo19AqvrPNqlVMpW9YEL2LIVY+Rpv2qsjCGxaDLNRgeGsQWJhfItebuJhsGSLjaBbNSQ+ieg==} @@ -3836,9 +3803,6 @@ packages: resolution: {integrity: sha512-B71/4oyj61iNH0KeCamLuE2rmKuTO5byTOSVwECM5FA7TiAiAW+UqTKZ9ERueC4qvgSttUhdmq1mXC3kJqGX7A==} engines: {node: '>=12.22'} - through2@4.0.2: - resolution: {integrity: sha512-iOqSav00cVxEEICeD7TjLB1sueEL+81Wpzp2bY17uZjZN0pWZPuo4suZ/61VujxmqSGFfgOcNuTZ85QJwNZQpw==} - tiny-invariant@1.3.3: resolution: {integrity: sha512-+FbBPE1o9QAYvviau/qC5SE3caw21q3xkvWKBtja5vgqOWIHHJ3ioaq1VPfn/Szqctz2bU/oYeKd9/z5BL+PVg==} @@ -4031,9 +3995,6 @@ packages: peerDependencies: react: ^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 - util-deprecate@1.0.2: - resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==} - uuid@11.0.5: resolution: {integrity: sha512-508e6IcKLrhxKdBbcA2b4KQZlLVp2+J5UwQ6F7Drckkc5N9ZJwFa4TgWtsww9UG8fGHbm6gbV19TdM5pQ4GaIA==} hasBin: true @@ -5723,11 +5684,6 @@ snapshots: '@types/ms@0.7.34': {} - '@types/ndjson@2.0.4': - dependencies: - '@types/node': 22.9.0 - '@types/through': 0.0.33 - '@types/node@22.9.0': dependencies: undici-types: 6.19.8 @@ -5759,10 +5715,6 @@ snapshots: dependencies: '@types/react': 18.3.12 - '@types/through@0.0.33': - dependencies: - '@types/node': 22.9.0 - '@types/trusted-types@2.0.7': optional: true @@ -7170,8 +7122,6 @@ snapshots: json-stable-stringify-without-jsonify@1.0.1: {} - json-stringify-safe@5.0.1: {} - json2mq@0.2.0: dependencies: string-convert: 0.2.1 @@ -7650,22 +7600,12 @@ snapshots: dependencies: 
brace-expansion: 2.0.1 - minimist@1.2.8: {} - ms@2.1.3: {} nanoid@3.3.11: {} natural-compare@1.4.0: {} - ndjson@2.0.0: - dependencies: - json-stringify-safe: 5.0.1 - minimist: 1.2.8 - readable-stream: 3.6.2 - split2: 3.2.2 - through2: 4.0.2 - neotraverse@0.6.18: {} no-case@3.0.4: @@ -8315,12 +8255,6 @@ snapshots: dependencies: loose-envify: 1.4.0 - readable-stream@3.6.2: - dependencies: - inherits: 2.0.4 - string_decoder: 1.3.0 - util-deprecate: 1.0.2 - readdirp@3.6.0: dependencies: picomatch: 2.3.1 @@ -8569,10 +8503,6 @@ snapshots: space-separated-tokens@2.0.2: {} - split2@3.2.2: - dependencies: - readable-stream: 3.6.2 - sprintf-js@1.0.3: {} stackback@0.0.2: {} @@ -8628,10 +8558,6 @@ snapshots: define-properties: 1.2.1 es-object-atoms: 1.0.0 - string_decoder@1.3.0: - dependencies: - safe-buffer: 5.2.1 - stringify-entities@4.0.4: dependencies: character-entities-html4: 2.1.0 @@ -8740,10 +8666,6 @@ snapshots: throttle-debounce@5.0.2: {} - through2@4.0.2: - dependencies: - readable-stream: 3.6.2 - tiny-invariant@1.3.3: {} tiny-warning@1.0.3: {} @@ -8944,8 +8866,6 @@ snapshots: dependencies: react: 18.3.1 - util-deprecate@1.0.2: {} - uuid@11.0.5: {} vfile-message@4.0.2: From cfae208099884f8d2ab968b84a80cc3d75482099 Mon Sep 17 00:00:00 2001 From: Elijah Williams Date: Tue, 6 May 2025 14:03:18 -0600 Subject: [PATCH 20/41] things are getting close --- ui/src/api/chatApi.ts | 49 ++++++++++--------- .../ChatMessages/ChatMessageController.tsx | 12 +++-- .../Loaders/PendingRagOutputSkeleton.tsx | 4 +- .../FooterComponents/RagChatQueryInput.tsx | 18 +++---- 4 files changed, 45 insertions(+), 38 deletions(-) diff --git a/ui/src/api/chatApi.ts b/ui/src/api/chatApi.ts index 675aa998..414fdaf0 100644 --- a/ui/src/api/chatApi.ts +++ b/ui/src/api/chatApi.ts @@ -57,6 +57,7 @@ import { EventStreamContentType, fetchEventSource, } from "@microsoft/fetch-event-source"; +import messageQueue from "src/utils/messageQueue.ts"; export interface SourceNode { node_id: string; @@ -393,11 +394,11 @@ const feedbackMutation = async ({ export interface ChatMutationResponse { text: string; response_id: string; - done: boolean; } export const useChatMutationV2 = ({ onError, + onSuccess, onChunk, }: UseMutationType & { onChunk: (msg: string) => void }) => { const queryClient = useQueryClient(); @@ -414,20 +415,30 @@ export const useChatMutationV2 = ({ appendPlaceholderToChatHistory(variables.query, cachedData), ); }, - onSuccess: (data, variables) => { - console.log(`onSuccess is here! 
with response id: ${data}`); - // queryClient.setQueryData>( - // chatHistoryQueryKey({ - // session_id: variables.request.session_id, - // }), - // (cachedData) => replacePlaceholderInChatHistory(data, cachedData), - // ); - queryClient - .invalidateQueries({ - queryKey: suggestedQuestionKey(variables.session_id), + onSuccess: (messageId, variables) => { + fetch( + `${llmServicePath}/sessions/${variables.session_id.toString()}/chat-history/${messageId}`, + ) + .then(async (res) => { + const message = (await res.json()) as ChatMessageType; + queryClient.setQueryData>( + chatHistoryQueryKey({ + session_id: variables.session_id, + }), + (cachedData) => + replacePlaceholderInChatHistory(message, cachedData), + ); + queryClient + .invalidateQueries({ + queryKey: suggestedQuestionKey(variables.session_id), + }) + .catch((error: unknown) => { + console.error(error); + }); + onSuccess?.(message); }) - .catch((error: unknown) => { - console.error(error); + .catch(() => { + messageQueue.error("An error occurred fetching the chat message"); }); }, onError: (error: Error, variables) => { @@ -476,10 +487,7 @@ const chatMutationV2 = async ( }), signal: ctrl.signal, onmessage(msg: EventSourceMessage) { - const data = JSON.parse(msg.data) as { - text?: string; - response_id?: string; - }; + const data = JSON.parse(msg.data) as ChatMutationResponse; if (data.text) { onChunk(data.text); @@ -488,9 +496,6 @@ const chatMutationV2 = async ( responseId = data.response_id; } }, - onclose() { - console.log("Connection closed"); - }, onerror(err) { console.error("Error", err); ctrl.abort(); @@ -501,8 +506,6 @@ const chatMutationV2 = async ( response.headers.get("content-type")?.includes(EventStreamContentType) ) { await Promise.resolve(); - console.log("all good"); - return; // everything's good } else if ( response.status >= 400 && response.status < 500 && diff --git a/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessageController.tsx b/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessageController.tsx index 5ddaccc9..82d47f59 100644 --- a/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessageController.tsx +++ b/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessageController.tsx @@ -144,7 +144,7 @@ const ChatMessageController = () => { ) { setTimeout(() => { if (bottomElement.current) { - bottomElement.current.scrollIntoView({ behavior: "auto" }); + bottomElement.current.scrollTop = 0; } }, 50); } @@ -181,18 +181,24 @@ const ChatMessageController = () => {
{isFetchingPreviousPage && } {flatChatHistory.map((historyMessage, index) => { + const isLast = index === flatChatHistory.length - 1; // trigger fetching on second to la`st item if (index === 2) { return (
+ {isLast &&
}
); } - return ; + return ( +
+ {isLast &&
} + +
+ ); })} -
); }; diff --git a/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx b/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx index e3395d4f..ecbb5ac3 100644 --- a/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx +++ b/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx @@ -62,7 +62,9 @@ const PendingRagOutputSkeleton = ({ question }: { question: string }) => { : undefined; return streamedMessage ? ( - +
+ +
) : ( diff --git a/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx b/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx index 84719803..e73b5481 100644 --- a/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx +++ b/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx @@ -42,7 +42,6 @@ import { useContext, useEffect, useRef, useState } from "react"; import { RagChatContext } from "pages/RagChatTab/State/RagChatContext.tsx"; import { createQueryConfiguration, - useChatMutation, useChatMutationV2, } from "src/api/chatApi.ts"; import { useParams, useSearch } from "@tanstack/react-router"; @@ -83,17 +82,14 @@ const RagChatQueryInput = ({ !search.question, ); - const chatMutation = useChatMutation({ - onSuccess: () => { - setUserInput(""); - }, - }); - const streamChatMutation = useChatMutationV2({ onChunk: (chunk) => { - // console.log("chunk", chunk); setStreamedChat((prev) => prev + chunk); }, + onSuccess: () => { + setUserInput(""); + setStreamedChat(""); + }, }); useEffect(() => { @@ -102,7 +98,7 @@ const RagChatQueryInput = ({ } }, [inputRef.current, flatChatHistory.length]); - const handleChat = async (userInput: string) => { + const handleChat = (userInput: string) => { if (userInput.trim().length <= 0) { return; } @@ -167,7 +163,7 @@ const RagChatQueryInput = ({ /> } - disabled={chatMutation.isPending} + disabled={streamChatMutation.isPending} />
); } - + if (isLast) { + console.log(historyMessage.id); + } return ( -
- {isLast &&
} +
); diff --git a/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx b/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx index ecbb5ac3..105ae901 100644 --- a/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx +++ b/ui/src/pages/RagChatTab/ChatOutput/Loaders/PendingRagOutputSkeleton.tsx @@ -36,40 +36,29 @@ * DATA. ******************************************************************************/ -import { Row, Skeleton } from "antd"; import { useContext } from "react"; import { RagChatContext } from "pages/RagChatTab/State/RagChatContext.tsx"; -import { ChatMessageBody } from "pages/RagChatTab/ChatOutput/ChatMessages/ChatMessage.tsx"; import { ChatMessageType, placeholderChatResponseId } from "src/api/chatApi.ts"; +import { ChatMessageBody } from "pages/RagChatTab/ChatOutput/ChatMessages/ChatMessageBody.tsx"; const PendingRagOutputSkeleton = ({ question }: { question: string }) => { const { streamedChatState: [streamedChat], } = useContext(RagChatContext); - const streamedMessage: ChatMessageType | undefined = streamedChat - ? { - id: placeholderChatResponseId, - session_id: 0, - source_nodes: [], - rag_message: { - user: question, - assistant: streamedChat, - }, - evaluations: [], - timestamp: Date.now(), - } - : undefined; + const streamedMessage: ChatMessageType = { + id: placeholderChatResponseId, + session_id: 0, + source_nodes: [], + rag_message: { + user: question, + assistant: streamedChat, + }, + evaluations: [], + timestamp: Date.now(), + }; - return streamedMessage ? ( -
- -
- ) : ( - - - - ); + return ; }; export default PendingRagOutputSkeleton; diff --git a/ui/src/pages/RagChatTab/ChatOutput/Sources/SourceNodes.tsx b/ui/src/pages/RagChatTab/ChatOutput/Sources/SourceNodes.tsx index 02e1799c..51467897 100644 --- a/ui/src/pages/RagChatTab/ChatOutput/Sources/SourceNodes.tsx +++ b/ui/src/pages/RagChatTab/ChatOutput/Sources/SourceNodes.tsx @@ -36,15 +36,23 @@ * DATA. ******************************************************************************/ -import { Flex, Typography } from "antd"; +import { Flex, Skeleton, Typography } from "antd"; import { SourceCard } from "pages/RagChatTab/ChatOutput/Sources/SourceCard.tsx"; -import { ChatMessageType } from "src/api/chatApi.ts"; +import { ChatMessageType, isPlaceholder } from "src/api/chatApi.ts"; import { WarningTwoTone } from "@ant-design/icons"; import { cdlOrange050, cdlOrange500 } from "src/cuix/variables.ts"; import { useGetModelById } from "src/api/modelsApi.ts"; import { useContext } from "react"; import { RagChatContext } from "pages/RagChatTab/State/RagChatContext.tsx"; +const SkeletonNode = () => { + return ( + + ); +}; const SourceNodes = ({ data }: { data: ChatMessageType }) => { const { data: inferenceModel } = useGetModelById(data.inference_model); const { activeSession } = useContext(RagChatContext); @@ -53,6 +61,17 @@ const SourceNodes = ({ data }: { data: ChatMessageType }) => { )); + if (isPlaceholder(data)) { + return ( + + + + + + + ); + } + if ( nodes.length === 0 && activeSession && From 8c8f1a82230283c31d4250a225f2dcb90e4407e5 Mon Sep 17 00:00:00 2001 From: Elijah Williams Date: Tue, 6 May 2025 16:55:12 -0600 Subject: [PATCH 28/41] only show loading nodes if kb --- .../ChatOutput/ChatMessages/ChatMessageController.tsx | 2 -- ui/src/pages/RagChatTab/ChatOutput/Sources/SourceNodes.tsx | 6 +++++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessageController.tsx b/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessageController.tsx index 21a0ccd5..fb1c5a29 100644 --- a/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessageController.tsx +++ b/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessageController.tsx @@ -138,10 +138,8 @@ const ChatMessageController = () => { flatChatHistory.length > 0 && isPlaceholder(flatChatHistory[flatChatHistory.length - 1]) ) { - console.log("placeholder useeffect"); bottomElement.current.scrollIntoView({ behavior: "smooth" }); } else { - console.log("initial useeffect"); bottomElement.current.scrollIntoView({ behavior: "auto" }); } } diff --git a/ui/src/pages/RagChatTab/ChatOutput/Sources/SourceNodes.tsx b/ui/src/pages/RagChatTab/ChatOutput/Sources/SourceNodes.tsx index 51467897..8f355a90 100644 --- a/ui/src/pages/RagChatTab/ChatOutput/Sources/SourceNodes.tsx +++ b/ui/src/pages/RagChatTab/ChatOutput/Sources/SourceNodes.tsx @@ -61,7 +61,11 @@ const SourceNodes = ({ data }: { data: ChatMessageType }) => { )); - if (isPlaceholder(data)) { + if ( + isPlaceholder(data) && + activeSession && + activeSession.dataSourceIds.length > 0 + ) { return ( From a06c03068f52255590f3eb72044f460bc65de734 Mon Sep 17 00:00:00 2001 From: Elijah Williams Date: Tue, 6 May 2025 16:56:42 -0600 Subject: [PATCH 29/41] remove unused --- ui/src/api/chatApi.ts | 69 ------------------------------------------- 1 file changed, 69 deletions(-) diff --git a/ui/src/api/chatApi.ts b/ui/src/api/chatApi.ts index 414fdaf0..1a8c2c67 100644 --- a/ui/src/api/chatApi.ts +++ b/ui/src/api/chatApi.ts @@ -259,75 +259,6 @@ export 
const replacePlaceholderInChatHistory = ( }; }; -export const useChatMutation = ({ - onSuccess, - onError, -}: UseMutationType) => { - const queryClient = useQueryClient(); - return useMutation({ - mutationKey: [MutationKeys.chatMutation], - mutationFn: chatMutation, - onMutate: (variables) => { - queryClient.setQueryData>( - chatHistoryQueryKey({ - session_id: variables.session_id, - }), - (cachedData) => - appendPlaceholderToChatHistory(variables.query, cachedData), - ); - }, - onSuccess: (data, variables) => { - queryClient.setQueryData>( - chatHistoryQueryKey({ - session_id: variables.session_id, - }), - (cachedData) => replacePlaceholderInChatHistory(data, cachedData), - ); - queryClient - .invalidateQueries({ - queryKey: suggestedQuestionKey(variables.session_id), - }) - .catch((error: unknown) => { - console.error(error); - }); - onSuccess?.(data); - }, - onError: (error: Error, variables) => { - const uuid = crypto.randomUUID(); - const errorMessage: ChatMessageType = { - id: `error-${uuid}`, - session_id: variables.session_id, - source_nodes: [], - rag_message: { - user: variables.query, - assistant: error.message, - }, - evaluations: [], - timestamp: Date.now(), - }; - queryClient.setQueryData>( - chatHistoryQueryKey({ - session_id: variables.session_id, - offset: 0, - }), - (cachedData) => - replacePlaceholderInChatHistory(errorMessage, cachedData), - ); - - onError?.(error); - }, - }); -}; - -const chatMutation = async ( - request: ChatMutationRequest, -): Promise => { - return await postRequest( - `${llmServicePath}/sessions/${request.session_id.toString()}/chat`, - request, - ); -}; - export const createQueryConfiguration = ( excludeKnowledgeBase: boolean, ): QueryConfiguration => { From 4e0fc1f7603bd81e5efc4d37db312e4f5e63d153 Mon Sep 17 00:00:00 2001 From: Elijah Williams Date: Tue, 6 May 2025 17:01:40 -0600 Subject: [PATCH 30/41] removing active loading state --- ui/src/pages/RagChatTab/ChatOutput/Sources/SourceNodes.tsx | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/ui/src/pages/RagChatTab/ChatOutput/Sources/SourceNodes.tsx b/ui/src/pages/RagChatTab/ChatOutput/Sources/SourceNodes.tsx index 8f355a90..92eff03f 100644 --- a/ui/src/pages/RagChatTab/ChatOutput/Sources/SourceNodes.tsx +++ b/ui/src/pages/RagChatTab/ChatOutput/Sources/SourceNodes.tsx @@ -46,12 +46,7 @@ import { useContext } from "react"; import { RagChatContext } from "pages/RagChatTab/State/RagChatContext.tsx"; const SkeletonNode = () => { - return ( - - ); + return ; }; const SourceNodes = ({ data }: { data: ChatMessageType }) => { const { data: inferenceModel } = useGetModelById(data.inference_model); From 9b4087b5de38d974b527e4293036ee84f6f69069 Mon Sep 17 00:00:00 2001 From: jwatson Date: Tue, 6 May 2025 16:09:09 -0700 Subject: [PATCH 31/41] fix mypy issues --- llm-service/app/routers/index/sessions/__init__.py | 4 ++-- llm-service/app/services/chat.py | 14 ++++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/llm-service/app/routers/index/sessions/__init__.py b/llm-service/app/routers/index/sessions/__init__.py index 10be6383..9712b6ff 100644 --- a/llm-service/app/routers/index/sessions/__init__.py +++ b/llm-service/app/routers/index/sessions/__init__.py @@ -38,7 +38,7 @@ import base64 import json import logging -from typing import Optional +from typing import Optional, Generator from fastapi import APIRouter, Header, HTTPException from fastapi.responses import StreamingResponse @@ -226,7 +226,7 @@ def stream_chat_completion( session = 
session_metadata_api.get_session(session_id, user_name=origin_remote_user) configuration = request.configuration or RagPredictConfiguration() - def generate_stream(): + def generate_stream() -> Generator[str, None, None]: response_id: str = "" for response in v3_chat( session, request.query, configuration, user_name=origin_remote_user diff --git a/llm-service/app/services/chat.py b/llm-service/app/services/chat.py index e68f6da2..059365bb 100644 --- a/llm-service/app/services/chat.py +++ b/llm-service/app/services/chat.py @@ -1,4 +1,4 @@ -# ############################################################################## +# ########################################################################## # CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) # (C) Cloudera, Inc. 2024 # All rights reserved. @@ -161,12 +161,13 @@ def _run_streaming_chat( ) response: ChatResponse = ChatResponse(message=ChatMessage(content=query)) - for response in streaming_chat_response.chat_stream: - response.additional_kwargs["response_id"] = response_id - yield response + if streaming_chat_response.chat_stream: + for response in streaming_chat_response.chat_stream: + response.additional_kwargs["response_id"] = response_id + yield response chat_response = AgentChatResponse( - response=response.message.content, + response=response.message.content or "", sources=streaming_chat_response.sources, source_nodes=streaming_chat_response.source_nodes, ) @@ -201,6 +202,7 @@ def _run_streaming_chat( ) + def _run_chat( session: Session, response_id: str, @@ -455,7 +457,7 @@ def stream_direct_llm_chat( evaluations=[], rag_message=RagMessage( user=query, - assistant=response.message.content, + assistant=response.message.content or "", ), timestamp=time.time(), condensed_question=None, From a66d48961581aedbaf914dd931ef93d11f83808e Mon Sep 17 00:00:00 2001 From: jwatson Date: Tue, 6 May 2025 16:12:34 -0700 Subject: [PATCH 32/41] ruff --- llm-service/app/services/llm_completion.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llm-service/app/services/llm_completion.py b/llm-service/app/services/llm_completion.py index b532e3be..922ea586 100644 --- a/llm-service/app/services/llm_completion.py +++ b/llm-service/app/services/llm_completion.py @@ -41,7 +41,6 @@ from llama_index.core.base.llms.types import ( ChatMessage, ChatResponse, - ChatResponseGen, ) from llama_index.core.llms import LLM From feacd5af5b828d4d9b38ef42d70f38ac5bb6172e Mon Sep 17 00:00:00 2001 From: actions-user Date: Tue, 6 May 2025 23:16:54 +0000 Subject: [PATCH 33/41] Update release version to dev-testing --- scripts/release_version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/release_version.txt b/scripts/release_version.txt index 114c45f6..32150ea1 100644 --- a/scripts/release_version.txt +++ b/scripts/release_version.txt @@ -1 +1 @@ -export RELEASE_TAG=1.17.0 +export RELEASE_TAG=dev-testing From 87fd117217f5d8302a229fa4ef457a961d37be8b Mon Sep 17 00:00:00 2001 From: Elijah Williams Date: Tue, 6 May 2025 17:26:07 -0600 Subject: [PATCH 34/41] handle file not found error for summaries when local --- .../app/ai/indexing/summary_indexer.py | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/llm-service/app/ai/indexing/summary_indexer.py b/llm-service/app/ai/indexing/summary_indexer.py index 9189f504..7d1290df 100644 --- a/llm-service/app/ai/indexing/summary_indexer.py +++ b/llm-service/app/ai/indexing/summary_indexer.py @@ -218,13 +218,17 @@ def create_storage_context( @classmethod def 
get_all_data_source_summaries(cls) -> dict[str, str]: root_dir = cls.__persist_root_dir() - # if not os.path.exists(root_dir): - # return {} - storage_context = SummaryIndexer.create_storage_context( - persist_dir=root_dir, - vector_store=SimpleVectorStore(), - ) - indices = load_indices_from_storage(storage_context=storage_context, index_ids=None, + try: + storage_context = SummaryIndexer.create_storage_context( + persist_dir=root_dir, + vector_store=SimpleVectorStore(), + ) + except FileNotFoundError: + # If the directory doesn't exist, we don't have any summaries. + return {} + indices = load_indices_from_storage( + storage_context=storage_context, + index_ids=None, **{ "llm": models.LLM.get_noop(), "response_synthesizer": models.LLM.get_noop(), @@ -234,11 +238,13 @@ def get_all_data_source_summaries(cls) -> dict[str, str]: "summary_query": "None", "data_source_id": 0, }, - ) + ) if len(indices) == 0: return {} - global_summary_store: DocumentSummaryIndex = cast(DocumentSummaryIndex, indices[0]) + global_summary_store: DocumentSummaryIndex = cast( + DocumentSummaryIndex, indices[0] + ) summary_ids = global_summary_store.index_struct.doc_id_to_summary_id.values() nodes = global_summary_store.docstore.get_nodes(list(summary_ids)) From 3cecc224a3b54d757d98f376e7b77bb213860640 Mon Sep 17 00:00:00 2001 From: Elijah Williams Date: Tue, 6 May 2025 17:27:07 -0600 Subject: [PATCH 35/41] remove log --- .../ChatOutput/ChatMessages/ChatMessageController.tsx | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessageController.tsx b/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessageController.tsx index fb1c5a29..9ea76106 100644 --- a/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessageController.tsx +++ b/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessageController.tsx @@ -177,7 +177,7 @@ const ChatMessageController = () => { {isFetchingPreviousPage && } {flatChatHistory.map((historyMessage, index) => { const isLast = index === flatChatHistory.length - 1; - // trigger fetching on second to la`st item + // trigger fetching on second to last item if (index === 2) { return (
@@ -186,9 +186,7 @@ const ChatMessageController = () => {
); } - if (isLast) { - console.log(historyMessage.id); - } + return (
Date: Wed, 7 May 2025 08:52:38 -0600 Subject: [PATCH 36/41] renaming --- ui/src/api/chatApi.ts | 6 +++--- .../ChatOutput/ChatMessages/ChatMessageController.tsx | 4 ++-- .../ChatOutput/Placeholders/SuggestedQuestionsCards.tsx | 4 ++-- .../pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ui/src/api/chatApi.ts b/ui/src/api/chatApi.ts index 1a8c2c67..83b4a461 100644 --- a/ui/src/api/chatApi.ts +++ b/ui/src/api/chatApi.ts @@ -327,7 +327,7 @@ export interface ChatMutationResponse { response_id: string; } -export const useChatMutationV2 = ({ +export const useStreamingChatMutation = ({ onError, onSuccess, onChunk, @@ -336,7 +336,7 @@ export const useChatMutationV2 = ({ return useMutation({ mutationKey: [MutationKeys.chatMutation], mutationFn: (request: ChatMutationRequest) => - chatMutationV2(request, onChunk), + streamChatMutation(request, onChunk), onMutate: (variables) => { queryClient.setQueryData>( chatHistoryQueryKey({ @@ -399,7 +399,7 @@ export const useChatMutationV2 = ({ }); }; -const chatMutationV2 = async ( +const streamChatMutation = async ( request: ChatMutationRequest, onChunk: (chunk: string) => void, ): Promise => { diff --git a/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessageController.tsx b/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessageController.tsx index 9ea76106..7797c545 100644 --- a/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessageController.tsx +++ b/ui/src/pages/RagChatTab/ChatOutput/ChatMessages/ChatMessageController.tsx @@ -50,7 +50,7 @@ import messageQueue from "src/utils/messageQueue.ts"; import { createQueryConfiguration, isPlaceholder, - useChatMutationV2, + useStreamingChatMutation, } from "src/api/chatApi.ts"; import { useRenameNameMutation } from "src/api/sessionApi.ts"; import NoDataSourcesState from "pages/RagChatTab/ChatOutput/Placeholders/NoDataSourcesState.tsx"; @@ -79,7 +79,7 @@ const ChatMessageController = () => { }, }); - const { mutate: chatMutation } = useChatMutationV2({ + const { mutate: chatMutation } = useStreamingChatMutation({ onChunk: (chunk) => { setStreamedChat((prev) => prev + chunk); }, diff --git a/ui/src/pages/RagChatTab/ChatOutput/Placeholders/SuggestedQuestionsCards.tsx b/ui/src/pages/RagChatTab/ChatOutput/Placeholders/SuggestedQuestionsCards.tsx index b678d9a4..84694a3d 100644 --- a/ui/src/pages/RagChatTab/ChatOutput/Placeholders/SuggestedQuestionsCards.tsx +++ b/ui/src/pages/RagChatTab/ChatOutput/Placeholders/SuggestedQuestionsCards.tsx @@ -42,7 +42,7 @@ import { useContext } from "react"; import { useSuggestQuestions } from "src/api/ragQueryApi.ts"; import { createQueryConfiguration, - useChatMutationV2, + useStreamingChatMutation, } from "src/api/chatApi.ts"; import useCreateSessionAndRedirect from "pages/RagChatTab/ChatOutput/hooks/useCreateSessionAndRedirect"; @@ -93,7 +93,7 @@ const SuggestedQuestionsCards = () => { const createSessionAndRedirect = useCreateSessionAndRedirect(); const { mutate: chatMutation, isPending: askRagIsPending } = - useChatMutationV2({ + useStreamingChatMutation({ onChunk: (chunk) => { setStreamedChat((prev) => prev + chunk); }, diff --git a/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx b/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx index e73b5481..2d51f4d8 100644 --- a/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx +++ b/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx @@ -42,7 +42,7 @@ import { useContext, useEffect, useRef, 
useState } from "react"; import { RagChatContext } from "pages/RagChatTab/State/RagChatContext.tsx"; import { createQueryConfiguration, - useChatMutationV2, + useStreamingChatMutation, } from "src/api/chatApi.ts"; import { useParams, useSearch } from "@tanstack/react-router"; import { cdlBlue600 } from "src/cuix/variables.ts"; @@ -82,7 +82,7 @@ const RagChatQueryInput = ({ !search.question, ); - const streamChatMutation = useChatMutationV2({ + const streamChatMutation = useStreamingChatMutation({ onChunk: (chunk) => { setStreamedChat((prev) => prev + chunk); }, From 6612fa5a97557836a6a77fb8c2cd77fe1fe7a344 Mon Sep 17 00:00:00 2001 From: Elijah Williams Date: Wed, 7 May 2025 10:34:57 -0600 Subject: [PATCH 37/41] better error handling --- .../app/routers/index/sessions/__init__.py | 19 ++-- llm-service/app/services/chat.py | 1 - ui/src/api/chatApi.ts | 87 ++++++++++++------- ui/src/api/modelsApi.ts | 4 +- 4 files changed, 67 insertions(+), 44 deletions(-) diff --git a/llm-service/app/routers/index/sessions/__init__.py b/llm-service/app/routers/index/sessions/__init__.py index 9712b6ff..d4521dbe 100644 --- a/llm-service/app/routers/index/sessions/__init__.py +++ b/llm-service/app/routers/index/sessions/__init__.py @@ -228,13 +228,18 @@ def stream_chat_completion( def generate_stream() -> Generator[str, None, None]: response_id: str = "" - for response in v3_chat( - session, request.query, configuration, user_name=origin_remote_user - ): - response_id = response.additional_kwargs["response_id"] - json_delta = json.dumps({ "text": response.delta }) - yield f"data: {json_delta}" + "\n\n" - yield f'data: {{"response_id" : "{response_id}"}}\n\n' + try: + for response in v3_chat( + session, request.query, configuration, user_name=origin_remote_user + ): + print(response) + response_id = response.additional_kwargs["response_id"] + json_delta = json.dumps({"text": response.delta}) + yield f"data: {json_delta}" + "\n\n" + yield f'data: {{"response_id" : "{response_id}"}}\n\n' + except Exception as e: + logger.exception("Failed to stream chat completion") + yield f'data: {{"error" : "{e}"}}\n\n' # kick off evals with full response # todo: write to history, start evals, rewrite question, log to mlfow once the response is done diff --git a/llm-service/app/services/chat.py b/llm-service/app/services/chat.py index 059365bb..1744ea3a 100644 --- a/llm-service/app/services/chat.py +++ b/llm-service/app/services/chat.py @@ -202,7 +202,6 @@ def _run_streaming_chat( ) - def _run_chat( session: Session, response_id: str, diff --git a/ui/src/api/chatApi.ts b/ui/src/api/chatApi.ts index 83b4a461..50938b02 100644 --- a/ui/src/api/chatApi.ts +++ b/ui/src/api/chatApi.ts @@ -57,7 +57,6 @@ import { EventStreamContentType, fetchEventSource, } from "@microsoft/fetch-event-source"; -import messageQueue from "src/utils/messageQueue.ts"; export interface SourceNode { node_id: string; @@ -323,20 +322,53 @@ const feedbackMutation = async ({ }; export interface ChatMutationResponse { - text: string; - response_id: string; + text?: string; + response_id?: string; + error?: string; } +const errorChatMessage = (variables: ChatMutationRequest, error: Error) => { + const uuid = crypto.randomUUID(); + const errorMessage: ChatMessageType = { + id: `error-${uuid}`, + session_id: variables.session_id, + source_nodes: [], + rag_message: { + user: variables.query, + assistant: error.message, + }, + evaluations: [], + timestamp: Date.now(), + }; + return errorMessage; +}; + export const useStreamingChatMutation = ({ onError, onSuccess, 
onChunk, }: UseMutationType & { onChunk: (msg: string) => void }) => { const queryClient = useQueryClient(); + const handleError = (variables: ChatMutationRequest, error: Error) => { + const errorMessage = errorChatMessage(variables, error); + queryClient.setQueryData>( + chatHistoryQueryKey({ + session_id: variables.session_id, + offset: 0, + }), + (cachedData) => replacePlaceholderInChatHistory(errorMessage, cachedData), + ); + }; return useMutation({ mutationKey: [MutationKeys.chatMutation], - mutationFn: (request: ChatMutationRequest) => - streamChatMutation(request, onChunk), + mutationFn: (request: ChatMutationRequest) => { + const convertError = (errorMessage: string) => { + const error = new Error(errorMessage); + handleError(request, error); + onError?.(error); + }; + return streamChatMutation(request, onChunk, convertError); + }, onMutate: (variables) => { queryClient.setQueryData>( chatHistoryQueryKey({ @@ -347,6 +379,9 @@ export const useStreamingChatMutation = ({ ); }, onSuccess: (messageId, variables) => { + if (!messageId) { + return; + } fetch( `${llmServicePath}/sessions/${variables.session_id.toString()}/chat-history/${messageId}`, ) @@ -368,32 +403,13 @@ export const useStreamingChatMutation = ({ }); onSuccess?.(message); }) - .catch(() => { - messageQueue.error("An error occurred fetching the chat message"); + .catch((error: unknown) => { + handleError(variables, error as Error); + onError?.(error as Error); }); }, onError: (error: Error, variables) => { - const uuid = crypto.randomUUID(); - const errorMessage: ChatMessageType = { - id: `error-${uuid}`, - session_id: variables.session_id, - source_nodes: [], - rag_message: { - user: variables.query, - assistant: error.message, - }, - evaluations: [], - timestamp: Date.now(), - }; - queryClient.setQueryData>( - chatHistoryQueryKey({ - session_id: variables.session_id, - offset: 0, - }), - (cachedData) => - replacePlaceholderInChatHistory(errorMessage, cachedData), - ); - + handleError(variables, error); onError?.(error); }, }); @@ -402,6 +418,7 @@ export const useStreamingChatMutation = ({ const streamChatMutation = async ( request: ChatMutationRequest, onChunk: (chunk: string) => void, + onError: (error: string) => void, ): Promise => { const ctrl = new AbortController(); let responseId = ""; @@ -420,6 +437,11 @@ const streamChatMutation = async ( onmessage(msg: EventSourceMessage) { const data = JSON.parse(msg.data) as ChatMutationResponse; + if (data.error) { + ctrl.abort(); + onError(data.error); + } + if (data.text) { onChunk(data.text); } @@ -427,9 +449,9 @@ const streamChatMutation = async ( responseId = data.response_id; } }, - onerror(err) { - console.error("Error", err); + onerror(err: unknown) { ctrl.abort(); + onError(String(err)); }, async onopen(response) { if ( @@ -442,10 +464,9 @@ const streamChatMutation = async ( response.status < 500 && response.status !== 429 ) { - // client-side errors are usually non-retriable: - throw new Error(); + onError("An error occurred: " + response.statusText); } else { - throw new Error(); + onError("An error occurred: " + response.statusText); } }, }, diff --git a/ui/src/api/modelsApi.ts b/ui/src/api/modelsApi.ts index e36c8f8c..70759e4f 100644 --- a/ui/src/api/modelsApi.ts +++ b/ui/src/api/modelsApi.ts @@ -57,13 +57,11 @@ export const useGetModelById = (model_id?: string) => { return useQuery({ queryKey: [QueryKeys.getModelById, { model_id }], queryFn: async () => { - if (!model_id) { - return undefined; - } const llmModels = await getLlmModels(); return 
llmModels.find((model) => model.model_id === model_id); }, staleTime: 1000 * 60 * 60, + enabled: !!model_id, }); }; From 65ca8310ad3b55a7a1b3ac68ff336119006dd120 Mon Sep 17 00:00:00 2001 From: Elijah Williams Date: Wed, 7 May 2025 10:43:10 -0600 Subject: [PATCH 38/41] bump bedrock to use max tokens of 1024 --- llm-service/app/services/models/llm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llm-service/app/services/models/llm.py b/llm-service/app/services/models/llm.py index a270c841..e09ec4a1 100644 --- a/llm-service/app/services/models/llm.py +++ b/llm-service/app/services/models/llm.py @@ -79,6 +79,7 @@ def get(cls, model_name: Optional[str] = None) -> llms.LLM: model=model_name, messages_to_prompt=messages_to_prompt, completion_to_prompt=completion_to_prompt, + max_tokens=1024, ) @staticmethod From 14ad9aec4c976c80fe984beb65e35bfafad23d7f Mon Sep 17 00:00:00 2001 From: jwatson Date: Wed, 7 May 2025 10:06:35 -0700 Subject: [PATCH 39/41] python refactoring lastFile:llm-service/app/routers/index/sessions/__init__.py --- .../app/routers/index/chat/__init__.py | 2 +- .../app/routers/index/sessions/__init__.py | 4 +- llm-service/app/services/chat.py | 464 ------------------ llm-service/app/services/chat/__init__.py | 38 ++ llm-service/app/services/chat/chat.py | 171 +++++++ .../app/services/chat/streaming_chat.py | 212 ++++++++ .../app/services/chat/suggested_questions.py | 158 ++++++ llm-service/app/services/chat/utils.py | 101 ++++ llm-service/app/services/query/querier.py | 2 +- llm-service/app/services/session.py | 7 +- llm-service/app/tests/services/test_chat.py | 2 +- 11 files changed, 690 insertions(+), 471 deletions(-) delete mode 100644 llm-service/app/services/chat.py create mode 100644 llm-service/app/services/chat/__init__.py create mode 100644 llm-service/app/services/chat/chat.py create mode 100644 llm-service/app/services/chat/streaming_chat.py create mode 100644 llm-service/app/services/chat/suggested_questions.py create mode 100644 llm-service/app/services/chat/utils.py diff --git a/llm-service/app/routers/index/chat/__init__.py b/llm-service/app/routers/index/chat/__init__.py index 2302d5af..e85cdad9 100644 --- a/llm-service/app/routers/index/chat/__init__.py +++ b/llm-service/app/routers/index/chat/__init__.py @@ -42,7 +42,7 @@ from pydantic import BaseModel from app import exceptions -from app.services.chat import generate_suggested_questions +from app.services.chat.suggested_questions import generate_suggested_questions logger = logging.getLogger(__name__) router = APIRouter(prefix="/chat", tags=["Chat"]) diff --git a/llm-service/app/routers/index/sessions/__init__.py b/llm-service/app/routers/index/sessions/__init__.py index d4521dbe..bcbe03bb 100644 --- a/llm-service/app/routers/index/sessions/__init__.py +++ b/llm-service/app/routers/index/sessions/__init__.py @@ -44,11 +44,11 @@ from fastapi.responses import StreamingResponse from pydantic import BaseModel +from app.services.chat.streaming_chat import v3_chat from .... 
import exceptions from ....rag_types import RagPredictConfiguration -from ....services.chat import ( +from ....services.chat.chat import ( v2_chat, - v3_chat, ) from ....services.chat_history.chat_history_manager import ( RagStudioChatMessage, diff --git a/llm-service/app/services/chat.py b/llm-service/app/services/chat.py deleted file mode 100644 index 1744ea3a..00000000 --- a/llm-service/app/services/chat.py +++ /dev/null @@ -1,464 +0,0 @@ -# ########################################################################## -# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) -# (C) Cloudera, Inc. 2024 -# All rights reserved. -# -# Applicable Open Source License: Apache 2.0 -# -# NOTE: Cloudera open source products are modular software products -# made up of hundreds of individual components, each of which was -# individually copyrighted. Each Cloudera open source product is a -# collective work under U.S. Copyright Law. Your license to use the -# collective work is as provided in your written agreement with -# Cloudera. Used apart from the collective work, this file is -# licensed for your use pursuant to the open source license -# identified above. -# -# This code is provided to you pursuant a written agreement with -# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute -# this code. If you do not have a written agreement with Cloudera nor -# with an authorized and properly licensed third party, you do not -# have any rights to access nor to use this code. -# -# Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the -# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY -# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED -# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO -# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND -# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, -# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS -# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE -# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY -# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR -# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES -# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF -# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF -# DATA. -# ############################################################################## -import time -import uuid -from random import shuffle -from typing import List, Iterable, Optional, Generator - -from fastapi import HTTPException -from llama_index.core.base.llms.types import MessageRole, ChatResponse, ChatMessage -from llama_index.core.chat_engine.types import AgentChatResponse -from pydantic import BaseModel - -from . 
import evaluators, llm_completion -from .chat_history.chat_history_manager import ( - RagPredictSourceNode, - Evaluation, - RagMessage, - RagStudioChatMessage, - chat_history_manager, -) -from .metadata_apis import session_metadata_api -from .metadata_apis.session_metadata_api import Session -from .mlflow import record_rag_mlflow_run, record_direct_llm_mlflow_run -from .query import querier -from .query.query_configuration import QueryConfiguration -from ..ai.vector_stores.vector_store_factory import VectorStoreFactory -from ..rag_types import RagPredictConfiguration - - -class RagContext(BaseModel): - role: MessageRole - content: str - - -def v3_chat( - session: Session, - query: str, - configuration: RagPredictConfiguration, - user_name: Optional[str], -) -> Generator[ChatResponse, None, None]: - query_configuration = QueryConfiguration( - top_k=session.response_chunks, - model_name=session.inference_model, - rerank_model_name=session.rerank_model, - exclude_knowledge_base=configuration.exclude_knowledge_base, - use_question_condensing=configuration.use_question_condensing, - use_hyde=session.query_configuration.enable_hyde, - use_summary_filter=session.query_configuration.enable_summary_filter, - ) - - response_id = str(uuid.uuid4()) - - if configuration.exclude_knowledge_base or len(session.data_source_ids) == 0: - return stream_direct_llm_chat(session, response_id, query, user_name) - - total_data_sources_size: int = sum( - map( - lambda ds_id: VectorStoreFactory.for_chunks(ds_id).size() or 0, - session.data_source_ids, - ) - ) - if total_data_sources_size == 0: - return stream_direct_llm_chat(session, response_id, query, user_name) - - return _run_streaming_chat( - session, response_id, query, query_configuration, user_name - ) - - -def v2_chat( - session: Session, - query: str, - configuration: RagPredictConfiguration, - user_name: Optional[str], -) -> RagStudioChatMessage: - query_configuration = QueryConfiguration( - top_k=session.response_chunks, - model_name=session.inference_model, - rerank_model_name=session.rerank_model, - exclude_knowledge_base=configuration.exclude_knowledge_base, - use_question_condensing=configuration.use_question_condensing, - use_hyde=session.query_configuration.enable_hyde, - use_summary_filter=session.query_configuration.enable_summary_filter, - ) - - response_id = str(uuid.uuid4()) - - if configuration.exclude_knowledge_base or len(session.data_source_ids) == 0: - return direct_llm_chat(session, response_id, query, user_name) - - total_data_sources_size: int = sum( - map( - lambda ds_id: VectorStoreFactory.for_chunks(ds_id).size() or 0, - session.data_source_ids, - ) - ) - if total_data_sources_size == 0: - return direct_llm_chat(session, response_id, query, user_name) - - new_chat_message: RagStudioChatMessage = _run_chat( - session, response_id, query, query_configuration, user_name - ) - - chat_history_manager.append_to_history(session.id, [new_chat_message]) - return new_chat_message - - -def _run_streaming_chat( - session: Session, - response_id: str, - query: str, - query_configuration: QueryConfiguration, - user_name: Optional[str], -) -> Generator[ChatResponse, None, None]: - if len(session.data_source_ids) != 1: - raise HTTPException( - status_code=400, detail="Only one datasource is supported for chat." 
- ) - - data_source_id: int = session.data_source_ids[0] - streaming_chat_response, condensed_question = querier.streaming_query( - data_source_id, - query, - query_configuration, - retrieve_chat_history(session.id), - ) - - response: ChatResponse = ChatResponse(message=ChatMessage(content=query)) - if streaming_chat_response.chat_stream: - for response in streaming_chat_response.chat_stream: - response.additional_kwargs["response_id"] = response_id - yield response - - chat_response = AgentChatResponse( - response=response.message.content or "", - sources=streaming_chat_response.sources, - source_nodes=streaming_chat_response.source_nodes, - ) - - if condensed_question and (condensed_question.strip() == query.strip()): - condensed_question = None - relevance, faithfulness = evaluators.evaluate_response( - query, chat_response, session.inference_model - ) - response_source_nodes = format_source_nodes(chat_response, data_source_id) - new_chat_message = RagStudioChatMessage( - id=response_id, - session_id=session.id, - source_nodes=response_source_nodes, - inference_model=session.inference_model, - rag_message=RagMessage( - user=query, - assistant=chat_response.response, - ), - evaluations=[ - Evaluation(name="relevance", value=relevance), - Evaluation(name="faithfulness", value=faithfulness), - ], - timestamp=time.time(), - condensed_question=condensed_question, - ) - - chat_history_manager.append_to_history(session.id, [new_chat_message]) - - record_rag_mlflow_run( - new_chat_message, query_configuration, response_id, session, user_name - ) - - -def _run_chat( - session: Session, - response_id: str, - query: str, - query_configuration: QueryConfiguration, - user_name: Optional[str], -) -> RagStudioChatMessage: - if len(session.data_source_ids) != 1: - raise HTTPException( - status_code=400, detail="Only one datasource is supported for chat." 
- ) - - data_source_id: int = session.data_source_ids[0] - response, condensed_question = querier.query( - data_source_id, - query, - query_configuration, - retrieve_chat_history(session.id), - ) - if condensed_question and (condensed_question.strip() == query.strip()): - condensed_question = None - relevance, faithfulness = evaluators.evaluate_response( - query, response, session.inference_model - ) - response_source_nodes = format_source_nodes(response, data_source_id) - new_chat_message = RagStudioChatMessage( - id=response_id, - session_id=session.id, - source_nodes=response_source_nodes, - inference_model=session.inference_model, - rag_message=RagMessage( - user=query, - assistant=response.response, - ), - evaluations=[ - Evaluation(name="relevance", value=relevance), - Evaluation(name="faithfulness", value=faithfulness), - ], - timestamp=time.time(), - condensed_question=condensed_question, - ) - - record_rag_mlflow_run( - new_chat_message, query_configuration, response_id, session, user_name - ) - return new_chat_message - - -def retrieve_chat_history(session_id: int) -> List[RagContext]: - chat_history = chat_history_manager.retrieve_chat_history(session_id)[:10] - history: List[RagContext] = [] - for message in chat_history: - history.append( - RagContext(role=MessageRole.USER, content=message.rag_message.user) - ) - history.append( - RagContext( - role=MessageRole.ASSISTANT, content=message.rag_message.assistant - ) - ) - return history - - -def format_source_nodes( - response: AgentChatResponse, data_source_id: int -) -> List[RagPredictSourceNode]: - response_source_nodes = [] - for source_node in response.source_nodes: - doc_id = source_node.node.metadata.get("document_id", source_node.node.node_id) - response_source_nodes.append( - RagPredictSourceNode( - node_id=source_node.node.node_id, - doc_id=doc_id, - source_file_name=source_node.node.metadata["file_name"], - score=source_node.score or 0.0, - dataSourceId=data_source_id, - ) - ) - response_source_nodes = sorted( - response_source_nodes, key=lambda x: x.score, reverse=True - ) - return response_source_nodes - - -SAMPLE_QUESTIONS = [ - "What is Cloudera, and how does it support organizations in managing big data?", - "What are the key components of the Cloudera Data Platform (CDP), and how do they work together?", - "How does Cloudera enable hybrid and multi-cloud data management for enterprises?", - "What are the primary use cases for Cloudera's platform in industries such as finance, healthcare, and retail?", - "How does Cloudera ensure data security and compliance with regulations like GDPR, HIPAA, and CCPA?", - "What is the role of Apache Hadoop and Apache Spark in Cloudera's ecosystem, and how do they contribute to data processing?", - "How does Cloudera's platform support machine learning and artificial intelligence workflows?", - "What are the differences between Cloudera Data Platform (CDP) Public Cloud and CDP Private Cloud?", - "How does Cloudera's platform handle data ingestion, storage, and real-time analytics at scale?", - "What tools and features does Cloudera provide for data governance, lineage, and cataloging?,", -] - - -def generate_dummy_suggested_questions() -> List[str]: - questions = SAMPLE_QUESTIONS.copy() - shuffle(questions) - return questions[:4] - - -def _generate_suggested_questions_direct_llm(session: Session) -> List[str]: - chat_history = retrieve_chat_history(session.id) - if not chat_history: - return generate_dummy_suggested_questions() - query_str = ( - " Give me a list of possible 
follow-up questions." - " Each question should be on a new line." - " There should be no more than four (4) questions." - " Each question should be no longer than fifteen (15) words." - " The response should be a bulleted list, using an asterisk (*) to denote the bullet item." - " Do not start like this - `Here are four questions that I can answer based on the context information`" - " Only return the list." - ) - chat_response = llm_completion.completion( - session.id, query_str, session.inference_model - ) - suggested_questions = process_response(chat_response.message.content) - return suggested_questions - - -def generate_suggested_questions( - session_id: Optional[int], - user_name: Optional[str] = None, -) -> List[str]: - if session_id is None: - return generate_dummy_suggested_questions() - session = session_metadata_api.get_session(session_id, user_name) - if len(session.data_source_ids) == 0: - return _generate_suggested_questions_direct_llm(session) - if len(session.data_source_ids) != 1: - raise HTTPException( - status_code=400, - detail="Only one datasource is supported for question suggestion.", - ) - data_source_id = session.data_source_ids[0] - - total_data_sources_size: int = sum( - map( - lambda ds_id: VectorStoreFactory.for_chunks(ds_id).size() or 0, - session.data_source_ids, - ) - ) - if total_data_sources_size == 0: - return _generate_suggested_questions_direct_llm(session) - # raise HTTPException(status_code=404, detail="Knowledge base not found.") - - chat_history = retrieve_chat_history(session_id) - if total_data_sources_size == 0: - suggested_questions = [] - else: - query_str = ( - "Give me a list of questions that you can answer." - " Each question should be on a new line." - " There should be no more than four (4) questions." - " Each question should be no longer than fifteen (15) words." - " The response should be a bulleted list, using an asterisk (*) to denote the bullet item." - " Do not return questions based on the metadata of the document. Only the content." - " Do not start like this - `Here are four questions that I can answer based on the context information`" - " Only return the list." - ) - if chat_history: - query_str = ( - query_str - + ( - "I will provide a response from my last question to help with generating new questions." - " Consider returning questions that are relevant to the response" - " They might be follow up questions or questions that are related to the response." 
- " Here is the last response received:\n" - ) - + chat_history[-1].content - ) - response, _ = querier.query( - data_source_id, - query_str, - QueryConfiguration( - top_k=session.response_chunks, - model_name=session.inference_model, - rerank_model_name=None, - exclude_knowledge_base=False, - use_question_condensing=False, - use_hyde=False, - use_postprocessor=False, - ), - [], - ) - suggested_questions = process_response(response.response) - return suggested_questions - - -def process_response(response: str | None) -> list[str]: - if response is None: - return [] - - sentences: Iterable[str] = response.splitlines() - sentences = map(lambda x: x.strip(), sentences) - sentences = map(lambda x: x.removeprefix("*").strip(), sentences) - sentences = map(lambda x: x.removeprefix("-").strip(), sentences) - sentences = map(lambda x: x.strip("*"), sentences) - sentences = filter(lambda x: len(x.split()) <= 60, sentences) - sentences = filter(lambda x: x != "Empty Response", sentences) - sentences = filter(lambda x: x != "", sentences) - return list(sentences)[:5] - - -def direct_llm_chat( - session: Session, response_id: str, query: str, user_name: Optional[str] -) -> RagStudioChatMessage: - record_direct_llm_mlflow_run(response_id, session, user_name) - - chat_response = llm_completion.completion( - session.id, query, session.inference_model - ) - new_chat_message = RagStudioChatMessage( - id=response_id, - session_id=session.id, - source_nodes=[], - inference_model=session.inference_model, - evaluations=[], - rag_message=RagMessage( - user=query, - assistant=str(chat_response.message.content), - ), - timestamp=time.time(), - condensed_question=None, - ) - chat_history_manager.append_to_history(session.id, [new_chat_message]) - return new_chat_message - - -def stream_direct_llm_chat( - session: Session, response_id: str, query: str, user_name: Optional[str] -) -> Generator[ChatResponse, None, None]: - record_direct_llm_mlflow_run(response_id, session, user_name) - - chat_response = llm_completion.stream_completion( - session.id, query, session.inference_model - ) - response: ChatResponse = ChatResponse(message=ChatMessage(content=query)) - for response in chat_response: - response.additional_kwargs["response_id"] = response_id - yield response - - new_chat_message = RagStudioChatMessage( - id=response_id, - session_id=session.id, - source_nodes=[], - inference_model=session.inference_model, - evaluations=[], - rag_message=RagMessage( - user=query, - assistant=response.message.content or "", - ), - timestamp=time.time(), - condensed_question=None, - ) - chat_history_manager.append_to_history(session.id, [new_chat_message]) diff --git a/llm-service/app/services/chat/__init__.py b/llm-service/app/services/chat/__init__.py new file mode 100644 index 00000000..9c598784 --- /dev/null +++ b/llm-service/app/services/chat/__init__.py @@ -0,0 +1,38 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2025 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. 
+# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. +# + diff --git a/llm-service/app/services/chat/chat.py b/llm-service/app/services/chat/chat.py new file mode 100644 index 00000000..b8c0b08f --- /dev/null +++ b/llm-service/app/services/chat/chat.py @@ -0,0 +1,171 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2025 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. 
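# A sketch of how the /chat route elsewhere in this patch drives the blocking
# path defined below (names such as session_id, request, and origin_remote_user
# are taken from the sessions router): resolve the session, fall back to a
# default configuration, and let v2_chat persist the resulting
# RagStudioChatMessage to the session's chat history.
session = session_metadata_api.get_session(session_id, user_name=origin_remote_user)
configuration = request.configuration or RagPredictConfiguration()
message = v2_chat(session, request.query, configuration, user_name=origin_remote_user)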
+# + +import time +import uuid +from typing import Optional + +from fastapi import HTTPException + +from app.services import evaluators, llm_completion +from app.services.chat.utils import retrieve_chat_history, format_source_nodes +from app.services.chat_history.chat_history_manager import ( + Evaluation, + RagMessage, + RagStudioChatMessage, + chat_history_manager, +) +from app.services.metadata_apis.session_metadata_api import Session +from app.services.mlflow import record_rag_mlflow_run, record_direct_llm_mlflow_run +from app.services.query import querier +from app.services.query.query_configuration import QueryConfiguration +from app.ai.vector_stores.vector_store_factory import VectorStoreFactory +from app.rag_types import RagPredictConfiguration + + +def v2_chat( + session: Session, + query: str, + configuration: RagPredictConfiguration, + user_name: Optional[str], +) -> RagStudioChatMessage: + query_configuration = QueryConfiguration( + top_k=session.response_chunks, + model_name=session.inference_model, + rerank_model_name=session.rerank_model, + exclude_knowledge_base=configuration.exclude_knowledge_base, + use_question_condensing=configuration.use_question_condensing, + use_hyde=session.query_configuration.enable_hyde, + use_summary_filter=session.query_configuration.enable_summary_filter, + ) + + response_id = str(uuid.uuid4()) + + if configuration.exclude_knowledge_base or len(session.data_source_ids) == 0: + return direct_llm_chat(session, response_id, query, user_name) + + total_data_sources_size: int = sum( + map( + lambda ds_id: VectorStoreFactory.for_chunks(ds_id).size() or 0, + session.data_source_ids, + ) + ) + if total_data_sources_size == 0: + return direct_llm_chat(session, response_id, query, user_name) + + new_chat_message: RagStudioChatMessage = _run_chat( + session, response_id, query, query_configuration, user_name + ) + + chat_history_manager.append_to_history(session.id, [new_chat_message]) + return new_chat_message + + +def _run_chat( + session: Session, + response_id: str, + query: str, + query_configuration: QueryConfiguration, + user_name: Optional[str], +) -> RagStudioChatMessage: + if len(session.data_source_ids) != 1: + raise HTTPException( + status_code=400, detail="Only one datasource is supported for chat." 
+ ) + + data_source_id: int = session.data_source_ids[0] + response, condensed_question = querier.query( + data_source_id, + query, + query_configuration, + retrieve_chat_history(session.id), + ) + if condensed_question and (condensed_question.strip() == query.strip()): + condensed_question = None + relevance, faithfulness = evaluators.evaluate_response( + query, response, session.inference_model + ) + response_source_nodes = format_source_nodes(response, data_source_id) + new_chat_message = RagStudioChatMessage( + id=response_id, + session_id=session.id, + source_nodes=response_source_nodes, + inference_model=session.inference_model, + rag_message=RagMessage( + user=query, + assistant=response.response, + ), + evaluations=[ + Evaluation(name="relevance", value=relevance), + Evaluation(name="faithfulness", value=faithfulness), + ], + timestamp=time.time(), + condensed_question=condensed_question, + ) + + record_rag_mlflow_run( + new_chat_message, query_configuration, response_id, session, user_name + ) + return new_chat_message + + +def direct_llm_chat( + session: Session, response_id: str, query: str, user_name: Optional[str] +) -> RagStudioChatMessage: + record_direct_llm_mlflow_run(response_id, session, user_name) + + chat_response = llm_completion.completion( + session.id, query, session.inference_model + ) + new_chat_message = RagStudioChatMessage( + id=response_id, + session_id=session.id, + source_nodes=[], + inference_model=session.inference_model, + evaluations=[], + rag_message=RagMessage( + user=query, + assistant=str(chat_response.message.content), + ), + timestamp=time.time(), + condensed_question=None, + ) + chat_history_manager.append_to_history(session.id, [new_chat_message]) + return new_chat_message + + diff --git a/llm-service/app/services/chat/streaming_chat.py b/llm-service/app/services/chat/streaming_chat.py new file mode 100644 index 00000000..6b9fe975 --- /dev/null +++ b/llm-service/app/services/chat/streaming_chat.py @@ -0,0 +1,212 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2025 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. 
("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. +# + +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2025 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. 
+# + +import time +import uuid +from typing import Optional, Generator + +from fastapi import HTTPException +from llama_index.core.base.llms.types import ChatResponse, ChatMessage +from llama_index.core.chat_engine.types import AgentChatResponse + +from app.ai.vector_stores.vector_store_factory import VectorStoreFactory +from app.rag_types import RagPredictConfiguration +from app.services import evaluators, llm_completion +from app.services.chat.utils import retrieve_chat_history, format_source_nodes +from app.services.chat_history.chat_history_manager import RagStudioChatMessage, RagMessage, Evaluation, \ + chat_history_manager +from app.services.metadata_apis.session_metadata_api import Session +from app.services.mlflow import record_rag_mlflow_run, record_direct_llm_mlflow_run +from app.services.query import querier +from app.services.query.query_configuration import QueryConfiguration + + +def v3_chat( + session: Session, + query: str, + configuration: RagPredictConfiguration, + user_name: Optional[str], +) -> Generator[ChatResponse, None, None]: + query_configuration = QueryConfiguration( + top_k=session.response_chunks, + model_name=session.inference_model, + rerank_model_name=session.rerank_model, + exclude_knowledge_base=configuration.exclude_knowledge_base, + use_question_condensing=configuration.use_question_condensing, + use_hyde=session.query_configuration.enable_hyde, + use_summary_filter=session.query_configuration.enable_summary_filter, + ) + + response_id = str(uuid.uuid4()) + + if configuration.exclude_knowledge_base or len(session.data_source_ids) == 0: + return _stream_direct_llm_chat(session, response_id, query, user_name) + + total_data_sources_size: int = sum( + map( + lambda ds_id: VectorStoreFactory.for_chunks(ds_id).size() or 0, + session.data_source_ids, + ) + ) + if total_data_sources_size == 0: + return _stream_direct_llm_chat(session, response_id, query, user_name) + + return _run_streaming_chat( + session, response_id, query, query_configuration, user_name + ) + + +def _run_streaming_chat( + session: Session, + response_id: str, + query: str, + query_configuration: QueryConfiguration, + user_name: Optional[str], +) -> Generator[ChatResponse, None, None]: + if len(session.data_source_ids) != 1: + raise HTTPException( + status_code=400, detail="Only one datasource is supported for chat." 
+ ) + + data_source_id: int = session.data_source_ids[0] + streaming_chat_response, condensed_question = querier.streaming_query( + data_source_id, + query, + query_configuration, + retrieve_chat_history(session.id), + ) + + response: ChatResponse = ChatResponse(message=ChatMessage(content=query)) + if streaming_chat_response.chat_stream: + for response in streaming_chat_response.chat_stream: + response.additional_kwargs["response_id"] = response_id + yield response + + chat_response = AgentChatResponse( + response=response.message.content or "", + sources=streaming_chat_response.sources, + source_nodes=streaming_chat_response.source_nodes, + ) + + if condensed_question and (condensed_question.strip() == query.strip()): + condensed_question = None + relevance, faithfulness = evaluators.evaluate_response( + query, chat_response, session.inference_model + ) + response_source_nodes = format_source_nodes(chat_response, data_source_id) + new_chat_message = RagStudioChatMessage( + id=response_id, + session_id=session.id, + source_nodes=response_source_nodes, + inference_model=session.inference_model, + rag_message=RagMessage( + user=query, + assistant=chat_response.response, + ), + evaluations=[ + Evaluation(name="relevance", value=relevance), + Evaluation(name="faithfulness", value=faithfulness), + ], + timestamp=time.time(), + condensed_question=condensed_question, + ) + + chat_history_manager.append_to_history(session.id, [new_chat_message]) + + record_rag_mlflow_run( + new_chat_message, query_configuration, response_id, session, user_name + ) + + +def _stream_direct_llm_chat( + session: Session, response_id: str, query: str, user_name: Optional[str] +) -> Generator[ChatResponse, None, None]: + record_direct_llm_mlflow_run(response_id, session, user_name) + + chat_response = llm_completion.stream_completion( + session.id, query, session.inference_model + ) + response: ChatResponse = ChatResponse(message=ChatMessage(content=query)) + for response in chat_response: + response.additional_kwargs["response_id"] = response_id + yield response + + new_chat_message = RagStudioChatMessage( + id=response_id, + session_id=session.id, + source_nodes=[], + inference_model=session.inference_model, + evaluations=[], + rag_message=RagMessage( + user=query, + assistant=response.message.content or "", + ), + timestamp=time.time(), + condensed_question=None, + ) + chat_history_manager.append_to_history(session.id, [new_chat_message]) diff --git a/llm-service/app/services/chat/suggested_questions.py b/llm-service/app/services/chat/suggested_questions.py new file mode 100644 index 00000000..bdd409e6 --- /dev/null +++ b/llm-service/app/services/chat/suggested_questions.py @@ -0,0 +1,158 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2025 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. 
If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. +# + +from random import shuffle +from typing import List, Optional + +from fastapi import HTTPException + +from app.ai.vector_stores.vector_store_factory import VectorStoreFactory +from app.services import llm_completion +from app.services.chat.utils import retrieve_chat_history, process_response +from app.services.metadata_apis import session_metadata_api +from app.services.metadata_apis.session_metadata_api import Session +from app.services.query import querier +from app.services.query.query_configuration import QueryConfiguration + +SAMPLE_QUESTIONS = [ + "What is Cloudera, and how does it support organizations in managing big data?", + "What are the key components of the Cloudera Data Platform (CDP), and how do they work together?", + "How does Cloudera enable hybrid and multi-cloud data management for enterprises?", + "What are the primary use cases for Cloudera's platform in industries such as finance, healthcare, and retail?", + "How does Cloudera ensure data security and compliance with regulations like GDPR, HIPAA, and CCPA?", + "What is the role of Apache Hadoop and Apache Spark in Cloudera's ecosystem, and how do they contribute to data processing?", + "How does Cloudera's platform support machine learning and artificial intelligence workflows?", + "What are the differences between Cloudera Data Platform (CDP) Public Cloud and CDP Private Cloud?", + "How does Cloudera's platform handle data ingestion, storage, and real-time analytics at scale?", + "What tools and features does Cloudera provide for data governance, lineage, and cataloging?,", +] + + +def generate_dummy_suggested_questions() -> List[str]: + questions = SAMPLE_QUESTIONS.copy() + shuffle(questions) + return questions[:4] + + +def _generate_suggested_questions_direct_llm(session: Session) -> List[str]: + chat_history = retrieve_chat_history(session.id) + if not chat_history: + return generate_dummy_suggested_questions() + query_str = ( + " Give me a list of possible follow-up questions." + " Each question should be on a new line." + " There should be no more than four (4) questions." + " Each question should be no longer than fifteen (15) words." + " The response should be a bulleted list, using an asterisk (*) to denote the bullet item." + " Do not start like this - `Here are four questions that I can answer based on the context information`" + " Only return the list." 
+ ) + chat_response = llm_completion.completion( + session.id, query_str, session.inference_model + ) + suggested_questions = process_response(chat_response.message.content) + return suggested_questions + + +def generate_suggested_questions( + session_id: Optional[int], + user_name: Optional[str] = None, +) -> List[str]: + if session_id is None: + return generate_dummy_suggested_questions() + session = session_metadata_api.get_session(session_id, user_name) + if len(session.data_source_ids) == 0: + return _generate_suggested_questions_direct_llm(session) + if len(session.data_source_ids) != 1: + raise HTTPException( + status_code=400, + detail="Only one datasource is supported for question suggestion.", + ) + data_source_id = session.data_source_ids[0] + + total_data_sources_size: int = sum( + map( + lambda ds_id: VectorStoreFactory.for_chunks(ds_id).size() or 0, + session.data_source_ids, + ) + ) + if total_data_sources_size == 0: + return _generate_suggested_questions_direct_llm(session) + # raise HTTPException(status_code=404, detail="Knowledge base not found.") + + chat_history = retrieve_chat_history(session_id) + if total_data_sources_size == 0: + suggested_questions = [] + else: + query_str = ( + "Give me a list of questions that you can answer." + " Each question should be on a new line." + " There should be no more than four (4) questions." + " Each question should be no longer than fifteen (15) words." + " The response should be a bulleted list, using an asterisk (*) to denote the bullet item." + " Do not return questions based on the metadata of the document. Only the content." + " Do not start like this - `Here are four questions that I can answer based on the context information`" + " Only return the list." + ) + if chat_history: + query_str = ( + query_str + + ( + "I will provide a response from my last question to help with generating new questions." + " Consider returning questions that are relevant to the response" + " They might be follow up questions or questions that are related to the response." + " Here is the last response received:\n" + ) + + chat_history[-1].content + ) + response, _ = querier.query( + data_source_id, + query_str, + QueryConfiguration( + top_k=session.response_chunks, + model_name=session.inference_model, + rerank_model_name=None, + exclude_knowledge_base=False, + use_question_condensing=False, + use_hyde=False, + use_postprocessor=False, + ), + [], + ) + suggested_questions = process_response(response.response) + return suggested_questions diff --git a/llm-service/app/services/chat/utils.py b/llm-service/app/services/chat/utils.py new file mode 100644 index 00000000..f626be38 --- /dev/null +++ b/llm-service/app/services/chat/utils.py @@ -0,0 +1,101 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2025 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. 
If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. +# + +from typing import List, Iterable + +from llama_index.core.base.llms.types import MessageRole +from llama_index.core.chat_engine.types import AgentChatResponse +from pydantic import BaseModel + +from app.services.chat_history.chat_history_manager import chat_history_manager, RagPredictSourceNode + + +class RagContext(BaseModel): + role: MessageRole + content: str + + +def retrieve_chat_history(session_id: int) -> List[RagContext]: + chat_history = chat_history_manager.retrieve_chat_history(session_id)[:10] + history: List[RagContext] = [] + for message in chat_history: + history.append( + RagContext(role=MessageRole.USER, content=message.rag_message.user) + ) + history.append( + RagContext( + role=MessageRole.ASSISTANT, content=message.rag_message.assistant + ) + ) + return history + + +def format_source_nodes( + response: AgentChatResponse, data_source_id: int +) -> List[RagPredictSourceNode]: + response_source_nodes = [] + for source_node in response.source_nodes: + doc_id = source_node.node.metadata.get("document_id", source_node.node.node_id) + response_source_nodes.append( + RagPredictSourceNode( + node_id=source_node.node.node_id, + doc_id=doc_id, + source_file_name=source_node.node.metadata["file_name"], + score=source_node.score or 0.0, + dataSourceId=data_source_id, + ) + ) + response_source_nodes = sorted( + response_source_nodes, key=lambda x: x.score, reverse=True + ) + return response_source_nodes + + +def process_response(response: str | None) -> list[str]: + if response is None: + return [] + + sentences: Iterable[str] = response.splitlines() + sentences = map(lambda x: x.strip(), sentences) + sentences = map(lambda x: x.removeprefix("*").strip(), sentences) + sentences = map(lambda x: x.removeprefix("-").strip(), sentences) + sentences = map(lambda x: x.strip("*"), sentences) + sentences = filter(lambda x: len(x.split()) <= 60, sentences) + sentences = filter(lambda x: x != "Empty Response", sentences) + sentences = filter(lambda x: x != "", sentences) + return list(sentences)[:5] diff --git a/llm-service/app/services/query/querier.py b/llm-service/app/services/query/querier.py index a713e069..717e8e74 100644 --- a/llm-service/app/services/query/querier.py +++ b/llm-service/app/services/query/querier.py @@ -32,7 +32,7 @@ import typing if typing.TYPE_CHECKING: - from ..chat import RagContext + from ..chat.utils import RagContext import logging from typing import List, Optional diff --git 
a/llm-service/app/services/session.py b/llm-service/app/services/session.py index 274780ca..7020dfd1 100644 --- a/llm-service/app/services/session.py +++ b/llm-service/app/services/session.py @@ -40,7 +40,10 @@ from fastapi import HTTPException from . import models -from .chat_history.chat_history_manager import chat_history_manager +from .chat_history.chat_history_manager import ( + chat_history_manager, + RagStudioChatMessage, +) from .metadata_apis import session_metadata_api RENAME_SESSION_PROMPT_TEMPLATE = """ @@ -78,7 +81,7 @@ def rename_session(session_id: int, user_name: Optional[str]) -> str: - chat_history = chat_history_manager.retrieve_chat_history(session_id=session_id) + chat_history: list[RagStudioChatMessage] = chat_history_manager.retrieve_chat_history(session_id=session_id) if not chat_history: raise HTTPException(status_code=400, detail="No chat history found") first_interaction = chat_history[0].rag_message diff --git a/llm-service/app/tests/services/test_chat.py b/llm-service/app/tests/services/test_chat.py index 32e4ae14..ce0136f5 100644 --- a/llm-service/app/tests/services/test_chat.py +++ b/llm-service/app/tests/services/test_chat.py @@ -40,7 +40,7 @@ from hypothesis import example, given from hypothesis import strategies as st -from app.services.chat import process_response +from app.services.chat.utils import process_response @st.composite From dd36cc0c35780bee7bc6754ee3a9c441fb1c8bad Mon Sep 17 00:00:00 2001 From: Baasit Sharief Date: Wed, 7 May 2025 10:18:51 -0700 Subject: [PATCH 40/41] mob next [ci-skip] [ci skip] [skip ci] lastFile:llm-service/app/routers/index/sessions/__init__.py --- .../app/routers/index/sessions/__init__.py | 8 ++--- llm-service/app/services/chat/chat.py | 2 +- .../app/services/chat/streaming_chat.py | 32 +------------------ 3 files changed, 6 insertions(+), 36 deletions(-) diff --git a/llm-service/app/routers/index/sessions/__init__.py b/llm-service/app/routers/index/sessions/__init__.py index bcbe03bb..7b350e63 100644 --- a/llm-service/app/routers/index/sessions/__init__.py +++ b/llm-service/app/routers/index/sessions/__init__.py @@ -44,11 +44,11 @@ from fastapi.responses import StreamingResponse from pydantic import BaseModel -from app.services.chat.streaming_chat import v3_chat +from app.services.chat.streaming_chat import stream_chat from .... 
import exceptions from ....rag_types import RagPredictConfiguration from ....services.chat.chat import ( - v2_chat, + chat as run_chat, ) from ....services.chat_history.chat_history_manager import ( RagStudioChatMessage, @@ -211,7 +211,7 @@ def chat( session = session_metadata_api.get_session(session_id, user_name=origin_remote_user) configuration = request.configuration or RagPredictConfiguration() - return v2_chat(session, request.query, configuration, user_name=origin_remote_user) + return run_chat(session, request.query, configuration, user_name=origin_remote_user) @router.post( @@ -229,7 +229,7 @@ def stream_chat_completion( def generate_stream() -> Generator[str, None, None]: response_id: str = "" try: - for response in v3_chat( + for response in stream_chat( session, request.query, configuration, user_name=origin_remote_user ): print(response) diff --git a/llm-service/app/services/chat/chat.py b/llm-service/app/services/chat/chat.py index b8c0b08f..0256d4ae 100644 --- a/llm-service/app/services/chat/chat.py +++ b/llm-service/app/services/chat/chat.py @@ -58,7 +58,7 @@ from app.rag_types import RagPredictConfiguration -def v2_chat( +def chat( session: Session, query: str, configuration: RagPredictConfiguration, diff --git a/llm-service/app/services/chat/streaming_chat.py b/llm-service/app/services/chat/streaming_chat.py index 6b9fe975..5cdad4c5 100644 --- a/llm-service/app/services/chat/streaming_chat.py +++ b/llm-service/app/services/chat/streaming_chat.py @@ -36,36 +36,6 @@ # DATA. # -# -# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) -# (C) Cloudera, Inc. 2025 -# All rights reserved. -# -# Applicable Open Source License: Apache 2.0 -# -# -# This code is provided to you pursuant a written agreement with -# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute -# this code. If you do not have a written agreement with Cloudera nor -# with an authorized and properly licensed third party, you do not -# have any rights to access nor to use this code. -# -# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the -# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY -# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED -# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO -# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND -# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, -# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS -# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE -# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY -# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR -# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES -# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF -# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF -# DATA. 
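# A hedged Python sketch of a client for the stream-completion endpoint touched
# above; the base URL, port, and session id are placeholders, and the payload
# fields assumed here ("text", "error") are the ones the UI stream handler
# consumes.
import json

import requests

url = "http://localhost:8000/sessions/42/stream-completion"  # placeholder host/id
with requests.post(url, json={"query": "What is RAG Studio?"}, stream=True) as resp:
    resp.raise_for_status()
    for raw in resp.iter_lines(decode_unicode=True):
        if not raw or not raw.startswith("data: "):
            continue
        event = json.loads(raw[len("data: "):])
        if "error" in event:
            raise RuntimeError(event["error"])
        print(event.get("text", ""), end="", flush=True)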
-# - import time import uuid from typing import Optional, Generator @@ -86,7 +56,7 @@ from app.services.query.query_configuration import QueryConfiguration -def v3_chat( +def stream_chat( session: Session, query: str, configuration: RagPredictConfiguration, From cd960a1342a38c0450556f1f868c99e73181104f Mon Sep 17 00:00:00 2001 From: Elijah Williams Date: Wed, 7 May 2025 11:21:27 -0600 Subject: [PATCH 41/41] nits --- llm-service/app/routers/index/sessions/__init__.py | 2 -- llm-service/app/services/chat/streaming_chat.py | 8 ++++++-- ui/src/api/chatApi.ts | 5 ++--- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/llm-service/app/routers/index/sessions/__init__.py b/llm-service/app/routers/index/sessions/__init__.py index 7b350e63..4223500c 100644 --- a/llm-service/app/routers/index/sessions/__init__.py +++ b/llm-service/app/routers/index/sessions/__init__.py @@ -241,6 +241,4 @@ def generate_stream() -> Generator[str, None, None]: logger.exception("Failed to stream chat completion") yield f'data: {{"error" : "{e}"}}\n\n' - # kick off evals with full response - # todo: write to history, start evals, rewrite question, log to mlfow once the response is done return StreamingResponse(generate_stream(), media_type="text/event-stream") diff --git a/llm-service/app/services/chat/streaming_chat.py b/llm-service/app/services/chat/streaming_chat.py index 5cdad4c5..ff173448 100644 --- a/llm-service/app/services/chat/streaming_chat.py +++ b/llm-service/app/services/chat/streaming_chat.py @@ -48,8 +48,12 @@ from app.rag_types import RagPredictConfiguration from app.services import evaluators, llm_completion from app.services.chat.utils import retrieve_chat_history, format_source_nodes -from app.services.chat_history.chat_history_manager import RagStudioChatMessage, RagMessage, Evaluation, \ - chat_history_manager +from app.services.chat_history.chat_history_manager import ( + RagStudioChatMessage, + RagMessage, + Evaluation, + chat_history_manager, +) from app.services.metadata_apis.session_metadata_api import Session from app.services.mlflow import record_rag_mlflow_run, record_direct_llm_mlflow_run from app.services.query import querier diff --git a/ui/src/api/chatApi.ts b/ui/src/api/chatApi.ts index 50938b02..7411d2f2 100644 --- a/ui/src/api/chatApi.ts +++ b/ui/src/api/chatApi.ts @@ -37,6 +37,7 @@ ******************************************************************************/ import { + commonHeaders, getRequest, llmServicePath, MutationKeys, @@ -426,9 +427,7 @@ const streamChatMutation = async ( `${llmServicePath}/sessions/${request.session_id.toString()}/stream-completion`, { method: "POST", - headers: { - "Content-Type": "application/json", - }, + headers: commonHeaders, body: JSON.stringify({ query: request.query, configuration: request.configuration,