diff --git a/index.toml b/index.toml
index 57fc13b..aafed4e 100644
--- a/index.toml
+++ b/index.toml
@@ -280,7 +280,6 @@ topics = ["Function Calling", "Agents"]
[[cookbook]]
title = "Extracting Metadata with an LLM"
notebook = "metadata_extraction_with_llm_metadata_extractor.ipynb"
-new = true
topics = ["Metadata"]
[[cookbook]]
@@ -316,3 +315,10 @@ new = true
experimental = true
topics = ["Multimodal"]
discuss = "https://github.com/deepset-ai/haystack-experimental/discussions/302"
+
+
+[[cookbook]]
+title = "Trace and Evaluate RAG with Arize Phoenix"
+notebook = "arize_phoenix_evaluate_haystack_rag.ipynb"
+topics = ["Observability", "Evaluation", "RAG"]
+new = true
\ No newline at end of file
diff --git a/notebooks/arize_phoenix_evaluate_haystack_rag.ipynb b/notebooks/arize_phoenix_evaluate_haystack_rag.ipynb
new file mode 100644
index 0000000..814d8d4
--- /dev/null
+++ b/notebooks/arize_phoenix_evaluate_haystack_rag.ipynb
@@ -0,0 +1,2109 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4t3LXM0aNbl2"
+ },
+ "source": [
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " Docs\n",
+ " |\n",
+ " GitHub\n",
+ " |\n",
+ " Community\n",
+ "
\n",
+ "\n",
+ "\n",
+ "# Tracing and Evaluating a Haystack RAG Application with Phoenix\n",
+ "\n",
+ "[Phoenix](https://haystack.deepset.ai/integrations/arize-phoenix) is a tool for tracing and evaluating LLM applications. In this tutorial, we will trace a Haystack RAG pipeline and then run three different types of evaluations on it:\n",
+ "\n",
+ "1. Relevance: Whether the retrieved documents are relevant to the question.\n",
+ "2. Q&A Correctness: Whether the answer to the question is correct.\n",
+ "3. Hallucination: Whether the answer contains hallucinations.\n",
+ "\n",
+ "ℹ️ This notebook requires an OpenAI API key.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "fGcvMui6fZIA"
+ },
+ "outputs": [],
+ "source": [
+ "!pip install -q openinference-instrumentation-haystack haystack-ai arize-phoenix"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "mqitn1QzOU5v"
+ },
+ "source": [
+ "## Set API Keys"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "rDg3U7kbOUX_",
+ "outputId": "91042f74-ff37-4122-bc08-62b4bda08d52"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "🔑 Enter your OpenAI API key: ··········\n"
+ ]
+ }
+ ],
+ "source": [
+ "from getpass import getpass\n",
+ "import os\n",
+ "\n",
+ "if not (openai_api_key := os.getenv(\"OPENAI_API_KEY\")):\n",
+ " openai_api_key = getpass(\"🔑 Enter your OpenAI API key: \")\n",
+ "\n",
+ "os.environ[\"OPENAI_API_KEY\"] = openai_api_key"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "qlWmN0pvpJCG"
+ },
+ "source": [
+ "## Launch Phoenix and Enable Haystack Tracing\n",
+ "\n",
+ "If you don't have a Phoenix API key, you can get one for free at [phoenix.arize.com](https://phoenix.arize.com). Arize Phoenix also provides [self-hosting options](https://docs.arize.com/phoenix/self-hosting) if you'd prefer to run the application yourself instead."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "3OhQrnlsfgdN",
+ "outputId": "885406fb-d8a4-47e4-c978-652e66530468"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Enter your Phoenix API Key··········\n"
+ ]
+ }
+ ],
+ "source": [
+ "if os.getenv(\"PHOENIX_API_KEY\") is None:\n",
+ " os.environ[\"PHOENIX_API_KEY\"] = getpass(\"Enter your Phoenix API Key\")\n",
+ "\n",
+ "os.environ[\"OTEL_EXPORTER_OTLP_HEADERS\"] = f\"api_key={os.environ['PHOENIX_API_KEY']}\"\n",
+ "os.environ[\"PHOENIX_CLIENT_HEADERS\"] = f\"api_key={os.environ['PHOENIX_API_KEY']}\"\n",
+ "os.environ[\"PHOENIX_COLLECTOR_ENDPOINT\"] = \"https://app.phoenix.arize.com\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Yy6RK4lbcxVy"
+ },
+ "source": [
+ "The command below connects Phoenix to your Haystack application and instruments the Haystack library. Any calls to Haystack pipelines from this point forward will be traced and logged to the Phoenix UI."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Jw2O_ydEcxVy"
+ },
+ "outputs": [],
+ "source": [
+ "from phoenix.otel import register\n",
+ "\n",
+ "project_name = \"Haystack RAG\"\n",
+ "tracer_provider = register(project_name=project_name, auto_instrument=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "twBLgY1LpMPW"
+ },
+ "source": [
+ "## Set up your Haystack app\n",
+ "\n",
+ "For a step-by-step guide to create a RAG pipeline with Haystack, follow the [Creating Your First QA Pipeline with Retrieval-Augmentation](https://haystack.deepset.ai/tutorials/27_first_rag_pipeline) tutorial."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "gMQqmOz0dY3x",
+ "outputId": "e658a8c9-eccb-462c-d37e-2782fb07cb9c"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "\n",
+ "🚅 Components\n",
+ " - retriever: InMemoryBM25Retriever\n",
+ " - prompt_builder: ChatPromptBuilder\n",
+ " - llm: OpenAIChatGenerator\n",
+ "🛤️ Connections\n",
+ " - retriever.documents -> prompt_builder.documents (List[Document])\n",
+ " - prompt_builder.prompt -> llm.messages (List[ChatMessage])"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from haystack.components.builders import ChatPromptBuilder\n",
+ "from haystack.dataclasses import ChatMessage, Document\n",
+ "from haystack.components.generators.chat import OpenAIChatGenerator\n",
+ "from haystack.document_stores.in_memory import InMemoryDocumentStore\n",
+ "from haystack.components.retrievers.in_memory import InMemoryBM25Retriever\n",
+ "from haystack import Pipeline\n",
+ "\n",
+ "# Write documents to InMemoryDocumentStore\n",
+ "document_store = InMemoryDocumentStore()\n",
+ "document_store.write_documents(\n",
+ " [\n",
+ " Document(content=\"My name is Jean and I live in Paris.\"),\n",
+ " Document(content=\"My name is Mark and I live in Berlin.\"),\n",
+ " Document(content=\"My name is Giorgio and I live in Rome.\"),\n",
+ " ]\n",
+ ")\n",
+ "\n",
+ "# Basic RAG Pipeline\n",
+ "template = [\n",
+ " ChatMessage.from_system(\n",
+ "\"\"\"\n",
+ "Answer the questions based on the given context.\n",
+ "\n",
+ "Context:\n",
+ "{% for document in documents %}\n",
+ " {{ document.content }}\n",
+ "{% endfor %}\n",
+ "Question: {{ question }}\n",
+ "Answer:\n",
+ "\"\"\"\n",
+ " )\n",
+ "]\n",
+ "rag_pipe = Pipeline()\n",
+ "rag_pipe.add_component(\"retriever\", InMemoryBM25Retriever(document_store=document_store))\n",
+ "rag_pipe.add_component(\"prompt_builder\", ChatPromptBuilder(template=template, required_variables=\"*\"))\n",
+ "rag_pipe.add_component(\"llm\", OpenAIChatGenerator(model=\"gpt-4o-mini\"))\n",
+ "\n",
+ "rag_pipe.connect(\"retriever\", \"prompt_builder.documents\")\n",
+ "rag_pipe.connect(\"prompt_builder.prompt\", \"llm.messages\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Run the pipeline with a query. It will automatically create a trace on Phoenix."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ji06yJ2Bfmx9",
+ "outputId": "d3a1b646-7c22-4272-adc8-d4a1a3d2e454"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Jean lives in Paris.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Ask a question\n",
+ "question = \"Who lives in Paris?\"\n",
+ "results = rag_pipe.run(\n",
+ " {\n",
+ " \"retriever\": {\"query\": question},\n",
+ " \"prompt_builder\": {\"question\": question},\n",
+ " }\n",
+ ")\n",
+ "\n",
+ "print(results[\"llm\"][\"replies\"][0].text)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "KDsd4qJIXfZv"
+ },
+ "source": [
+ "## Evaluating Retrieved Docs\n",
+ "\n",
+ "Now that we've traced our pipeline, let's start by evaluating the retrieved documents.\n",
+ "\n",
+ "All evaluations in Phoenix use the same general process:\n",
+ "1. Query and download trace data from Phoenix\n",
+ "2. Add evaluation labels to the trace data. This can be done using the Phoenix library, using Haystack evaluators, or using your own evaluators.\n",
+ "3. Log the evaluation labels to Phoenix\n",
+ "4. View evaluations\n",
+ "\n",
+ "We'll use the `get_retrieved_documents` function to get the trace data for the retrieved documents."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "_0XDNCW3YFCz",
+ "outputId": "c8d30639-e4be-4064-acb0-e3b7825c663d"
+ },
+ "outputs": [],
+ "source": [
+ "import nest_asyncio\n",
+ "nest_asyncio.apply()\n",
+ "\n",
+ "import phoenix as px\n",
+ "client = px.Client()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 211
+ },
+ "id": "FT9NbFomYYoX",
+ "outputId": "f46f3a3d-1290-4a14-fa9c-4adce0a6c4c9"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/usr/local/lib/python3.11/dist-packages/phoenix/utilities/client.py:60: UserWarning: The Phoenix server (10.9.1) and client (10.11.0) versions are mismatched and may have compatibility issues.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "summary": "{\n \"name\": \"retrieved_documents_df\",\n \"rows\": 3,\n \"fields\": [\n {\n \"column\": \"context.trace_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"53d4a3ef151e2dc3009fa6aff152dc86\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"input\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"{\\\"query\\\": \\\"Who lives in Paris?\\\", \\\"filters\\\": null, \\\"top_k\\\": null, \\\"scale_score\\\": null}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"reference\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"My name is Jean and I live in Paris.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"document_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.30336537496698407,\n \"min\": 0.7680100781,\n \"max\": 1.2934543208,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.7680100781\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
+ "type": "dataframe",
+ "variable_name": "retrieved_documents_df"
+ },
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " context.trace_id | \n",
+ " input | \n",
+ " reference | \n",
+ " document_score | \n",
+ "
\n",
+ " \n",
+ " context.span_id | \n",
+ " document_position | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 40880a3ade3753c3 | \n",
+ " 0 | \n",
+ " 53d4a3ef151e2dc3009fa6aff152dc86 | \n",
+ " {\"query\": \"Who lives in Paris?\", \"filters\": nu... | \n",
+ " My name is Jean and I live in Paris. | \n",
+ " 1.293454 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 53d4a3ef151e2dc3009fa6aff152dc86 | \n",
+ " {\"query\": \"Who lives in Paris?\", \"filters\": nu... | \n",
+ " My name is Mark and I live in Berlin. | \n",
+ " 0.768010 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 53d4a3ef151e2dc3009fa6aff152dc86 | \n",
+ " {\"query\": \"Who lives in Paris?\", \"filters\": nu... | \n",
+ " My name is Giorgio and I live in Rome. | \n",
+ " 0.768010 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "text/plain": [
+ " context.trace_id \\\n",
+ "context.span_id document_position \n",
+ "40880a3ade3753c3 0 53d4a3ef151e2dc3009fa6aff152dc86 \n",
+ " 1 53d4a3ef151e2dc3009fa6aff152dc86 \n",
+ " 2 53d4a3ef151e2dc3009fa6aff152dc86 \n",
+ "\n",
+ " input \\\n",
+ "context.span_id document_position \n",
+ "40880a3ade3753c3 0 {\"query\": \"Who lives in Paris?\", \"filters\": nu... \n",
+ " 1 {\"query\": \"Who lives in Paris?\", \"filters\": nu... \n",
+ " 2 {\"query\": \"Who lives in Paris?\", \"filters\": nu... \n",
+ "\n",
+ " reference \\\n",
+ "context.span_id document_position \n",
+ "40880a3ade3753c3 0 My name is Jean and I live in Paris. \n",
+ " 1 My name is Mark and I live in Berlin. \n",
+ " 2 My name is Giorgio and I live in Rome. \n",
+ "\n",
+ " document_score \n",
+ "context.span_id document_position \n",
+ "40880a3ade3753c3 0 1.293454 \n",
+ " 1 0.768010 \n",
+ " 2 0.768010 "
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from phoenix.session.evaluation import get_retrieved_documents\n",
+ "\n",
+ "retrieved_documents_df = get_retrieved_documents(px.Client(), project_name=project_name)\n",
+ "retrieved_documents_df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "9tT6N2FrcxVz"
+ },
+ "source": [
+ "Next we'll use Phoenix's `RelevanceEvaluator` to evaluate the relevance of the retrieved documents. This evaluator uses an LLM to determine if the retrieved documents contain the answer to the question."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 49,
+ "referenced_widgets": [
+ "8630250115714db88b008107217b975d",
+ "bf4db431987446d0a65e8255c1278114",
+ "2516fdc2953b43ada41eb8a7094dccf0",
+ "e46c7ba988854d2ba8eb2b3bbda31fdb",
+ "5831127093444b1e9ad6831c1c0acd82",
+ "257a037997734a3e86b92244e22a1b32",
+ "10bc0bed1e384042b61551f609774e39",
+ "a77ba70463594f1da34de83faf8bf143",
+ "248b80441b3d47fcb3811a35047380be",
+ "5e9a221077e34a2dbce71ee33ff978ba",
+ "11e300004afe448fb9779c9843f73d7b"
+ ]
+ },
+ "id": "RsAJdoFWYZzk",
+ "outputId": "ab47d7fa-c902-4198-87a1-2ad2dab23d74"
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "8630250115714db88b008107217b975d",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "run_evals | | 0/3 (0.0%) | ⏳ 00:00 | ?it/s"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from phoenix.evals import OpenAIModel, RelevanceEvaluator, run_evals\n",
+ "\n",
+ "relevance_evaluator = RelevanceEvaluator(OpenAIModel(model=\"gpt-4o-mini\"))\n",
+ "\n",
+ "retrieved_documents_relevance_df = run_evals(\n",
+ " evaluators=[relevance_evaluator],\n",
+ " dataframe=retrieved_documents_df,\n",
+ " provide_explanation=True,\n",
+ " concurrency=20,\n",
+ ")[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 175
+ },
+ "id": "p0ANPW35Xhkx",
+ "outputId": "7b064148-3714-4b7f-ccb8-93275bf1a057"
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "summary": "{\n \"name\": \"retrieved_documents_relevance_df\",\n \"rows\": 3,\n \"fields\": [\n {\n \"column\": \"label\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"unrelated\",\n \"relevant\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 0,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"explanation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"The question asks who lives in Paris. The reference text states, 'My name is Jean and I live in Paris.' This directly provides the information that Jean lives in Paris, which answers the question. Therefore, the reference text is relevant to the question.\",\n \"The question asks about who lives in Paris, which is a specific location. The reference text states that the speaker, Mark, lives in Berlin. Since Berlin is not Paris, the information provided does not help answer the question about who lives in Paris. Therefore, the reference text is unrelated to the question.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
+ "type": "dataframe",
+ "variable_name": "retrieved_documents_relevance_df"
+ },
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " label | \n",
+ " score | \n",
+ " explanation | \n",
+ "
\n",
+ " \n",
+ " context.span_id | \n",
+ " document_position | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 40880a3ade3753c3 | \n",
+ " 0 | \n",
+ " relevant | \n",
+ " 1 | \n",
+ " The question asks who lives in Paris. The refe... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " unrelated | \n",
+ " 0 | \n",
+ " The question asks about who lives in Paris, wh... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " unrelated | \n",
+ " 0 | \n",
+ " The question asks about who lives in Paris, wh... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "text/plain": [
+ " label score \\\n",
+ "context.span_id document_position \n",
+ "40880a3ade3753c3 0 relevant 1 \n",
+ " 1 unrelated 0 \n",
+ " 2 unrelated 0 \n",
+ "\n",
+ " explanation \n",
+ "context.span_id document_position \n",
+ "40880a3ade3753c3 0 The question asks who lives in Paris. The refe... \n",
+ " 1 The question asks about who lives in Paris, wh... \n",
+ " 2 The question asks about who lives in Paris, wh... "
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "retrieved_documents_relevance_df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "RtW1gy5IcxVz"
+ },
+ "source": [
+ "Finally, we'll log the evaluation labels to Phoenix."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "TSwP858Wb7Qj"
+ },
+ "outputs": [],
+ "source": [
+ "from phoenix.trace import DocumentEvaluations, SpanEvaluations\n",
+ "\n",
+ "px.Client().log_evaluations(\n",
+ " DocumentEvaluations(dataframe=retrieved_documents_relevance_df, eval_name=\"relevance\"),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "7DGCDLDCcxVz"
+ },
+ "source": [
+ "If you now click on your document retrieval span in Phoenix, you should see the evaluation labels.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "lDiG7BdwgUpw"
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "O5JBstOntoJx"
+ },
+ "source": [
+ "## Evaluate Response\n",
+ "\n",
+ "With `HallucinationEvaluator` and `QAEvaluator`, we can evaluate whether the generated response correctly answers the question and whether it contains hallucinations."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 148
+ },
+ "id": "WX0iCQJWtwQQ",
+ "outputId": "ec18105e-d593-4107-96f6-e95c30ff1d82"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/usr/local/lib/python3.11/dist-packages/phoenix/utilities/client.py:60: UserWarning: The Phoenix server (10.9.1) and client (10.11.0) versions are mismatched and may have compatibility issues.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "summary": "{\n \"name\": \"qa_with_reference_df\",\n \"rows\": 1,\n \"fields\": [\n {\n \"column\": \"context.span_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"a3e33d1e526e97bd\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"input\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"{\\\"data\\\": {\\\"retriever\\\": {\\\"query\\\": \\\"Who lives in Paris?\\\"}, \\\"prompt_builder\\\": {\\\"question\\\": \\\"Who lives in Paris?\\\"}}}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"output\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"{\\\"llm\\\": {\\\"replies\\\": [\\\"ChatMessage(_role=, _content=[TextContent(text='Jean lives in Paris.')], _name=None, _meta={'model': 'gpt-4o-mini-2024-07-18', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 5, 'prompt_tokens': 61, 'total_tokens': 66, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}})\\\"]}}\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"reference\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"My name is Jean and I live in Paris.\\n\\nMy name is Mark and I live in Berlin.\\n\\nMy name is Giorgio and I live in Rome.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
+ "type": "dataframe",
+ "variable_name": "qa_with_reference_df"
+ },
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " input | \n",
+ " output | \n",
+ " reference | \n",
+ "
\n",
+ " \n",
+ " context.span_id | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " a3e33d1e526e97bd | \n",
+ " {\"data\": {\"retriever\": {\"query\": \"Who lives in... | \n",
+ " {\"llm\": {\"replies\": [\"ChatMessage(_role=<ChatR... | \n",
+ " My name is Jean and I live in Paris.\\n\\nMy nam... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "text/plain": [
+ " input \\\n",
+ "context.span_id \n",
+ "a3e33d1e526e97bd {\"data\": {\"retriever\": {\"query\": \"Who lives in... \n",
+ "\n",
+ " output \\\n",
+ "context.span_id \n",
+ "a3e33d1e526e97bd {\"llm\": {\"replies\": [\"ChatMessage(_role=