From db3c2c4e1c83ac38daddda8c921757741c53a670 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jannik=20Maierh=C3=B6fer?= Date: Tue, 17 Jun 2025 15:08:50 +0200 Subject: [PATCH] Fix: update notebook to langfuse sdk v3 --- examples/agents_sdk/evaluate_agents.ipynb | 339 ++++++++++------------ 1 file changed, 153 insertions(+), 186 deletions(-) diff --git a/examples/agents_sdk/evaluate_agents.ipynb b/examples/agents_sdk/evaluate_agents.ipynb index 9f4c9861aa..26d0fd17bd 100644 --- a/examples/agents_sdk/evaluate_agents.ipynb +++ b/examples/agents_sdk/evaluate_agents.ipynb @@ -45,7 +45,7 @@ "source": [ "## Step 0: Install the Required Libraries\n", "\n", - "Below we install the `openai-agents` library (the OpenAI Agents SDK [link text](https://github.com/openai/openai-agents-python)), the `pydantic-ai[logfire]` OpenTelemetry instrumentation, `langfuse` and the Hugging Face `datasets` library" + "Below we install the `openai-agents` library (the [OpenAI Agents SDK](https://github.com/openai/openai-agents-python)), the `pydantic-ai[logfire]` OpenTelemetry instrumentation, `langfuse` and the Hugging Face `datasets` library" ] }, { @@ -61,11 +61,7 @@ }, "outputs": [], "source": [ - "%pip install openai-agents\n", - "%pip install nest_asyncio\n", - "%pip install pydantic-ai[logfire]\n", - "%pip install langfuse\n", - "%pip install datasets" + "%pip install openai-agents nest_asyncio \"pydantic-ai[logfire]\" langfuse datasets" ] }, { @@ -93,44 +89,46 @@ "import base64\n", "\n", "# Get keys for your project from the project settings page: https://cloud.langfuse.com\n", - "os.environ[\"LANGFUSE_PUBLIC_KEY\"] = \"pk-lf-...\"\n", - "os.environ[\"LANGFUSE_SECRET_KEY\"] = \"sk-lf-...\"\n", + "os.environ[\"LANGFUSE_PUBLIC_KEY\"] = \"pk-lf-...\" \n", + "os.environ[\"LANGFUSE_SECRET_KEY\"] = \"sk-lf-...\" \n", "os.environ[\"LANGFUSE_HOST\"] = \"https://cloud.langfuse.com\" # 🇪🇺 EU region\n", "# os.environ[\"LANGFUSE_HOST\"] = \"https://us.cloud.langfuse.com\" # 🇺🇸 US region\n", "\n", + "# Build Basic Auth header.\n", "LANGFUSE_AUTH = base64.b64encode(\n", " f\"{os.environ.get('LANGFUSE_PUBLIC_KEY')}:{os.environ.get('LANGFUSE_SECRET_KEY')}\".encode()\n", ").decode()\n", - "\n", + " \n", + "# Configure OpenTelemetry endpoint & headers\n", "os.environ[\"OTEL_EXPORTER_OTLP_ENDPOINT\"] = os.environ.get(\"LANGFUSE_HOST\") + \"/api/public/otel\"\n", "os.environ[\"OTEL_EXPORTER_OTLP_HEADERS\"] = f\"Authorization=Basic {LANGFUSE_AUTH}\"\n", "\n", - "# Set your OpenAI API Key\n", + "# Your openai key\n", "os.environ[\"OPENAI_API_KEY\"] = \"sk-proj-...\"" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the environment variables set, we can now initialize the Langfuse client. `get_client()` initializes the Langfuse client using the credentials provided in the environment variables." 
+ ] + }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "KQjuJNHkfoSP" - }, + "metadata": {}, "outputs": [], "source": [ - "from opentelemetry.sdk.trace import TracerProvider\n", - "from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter\n", - "from opentelemetry.sdk.trace.export import SimpleSpanProcessor\n", - "\n", - "# Create a TracerProvider for OpenTelemetry\n", - "trace_provider = TracerProvider()\n", - "\n", - "# Add a SimpleSpanProcessor with the OTLPSpanExporter to send traces\n", - "trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))\n", - "\n", - "# Set the global default tracer provider\n", - "from opentelemetry import trace\n", - "trace.set_tracer_provider(trace_provider)\n", - "tracer = trace.get_tracer(__name__)" + "from langfuse import get_client\n", + " \n", + "langfuse = get_client()\n", + " \n", + "# Verify connection\n", + "if langfuse.auth_check():\n", + " print(\"Langfuse client is authenticated and ready!\")\n", + "else:\n", + " print(\"Authentication failed. Please check your credentials and host.\")" ] }, { @@ -144,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": { "id": "td11AsCShBxA" }, @@ -187,7 +185,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -200,30 +198,28 @@ "name": "stdout", "output_type": "stream", "text": [ - "12:01:03.401 OpenAI Agents trace: Agent workflow\n", - "12:01:03.403 Agent run: 'Assistant'\n", - "12:01:03.404 Responses API with 'gpt-4o'\n", + "13:00:52.784 OpenAI Agents trace: Agent workflow\n", + "13:00:52.787 Agent run: 'Assistant'\n", + "13:00:52.797 Responses API with 'gpt-4o'\n", "Evaluating AI agents is crucial for several reasons:\n", "\n", - "1. **Performance Verification**: Ensures that the AI performs its intended tasks accurately and efficiently, meeting the desired objectives and criteria.\n", + "1. **Performance Assessment**: It helps determine if the agent meets the desired goals and performs tasks effectively. By evaluating, we can assess accuracy, speed, and overall performance.\n", "\n", - "2. **Reliability and Consistency**: Assesses whether the AI provides consistent results across different scenarios and over time.\n", + "2. **Reliability and Consistency**: Regular evaluation ensures that the AI behaves consistently under different conditions and is reliable in production environments.\n", "\n", - "3. **Safety and Risk Management**: Identifies potential risks or harmful behaviors that could lead to undesirable outcomes, ensuring the AI operates safely within defined limits.\n", + "3. **Bias and Fairness**: Identifying and mitigating biases is essential for fair and ethical AI. Evaluation helps uncover any discriminatory patterns in the agent's behavior.\n", "\n", - "4. **Bias and Fairness**: Checks for any biases in the AI’s decision-making process to promote fairness and avoid discrimination against particular groups.\n", + "4. **Safety**: Evaluating AI agents ensures they operate safely and do not cause harm or unintended side effects, especially in critical applications.\n", "\n", - "5. **User Trust and Adoption**: Builds confidence and trust in the AI system among users and stakeholders, which is essential for widespread adoption.\n", + "5. 
**User Trust**: Proper evaluation builds trust with users and stakeholders by demonstrating that the AI is effective and aligned with expectations.\n", "\n", - "6. **Regulatory Compliance**: Ensures that the AI adheres to relevant laws, regulations, and ethical guidelines, which may vary by industry or region.\n", + "6. **Regulatory Compliance**: It ensures adherence to legal and ethical standards, which is increasingly important as regulations around AI evolve.\n", "\n", - "7. **Continuous Improvement**: Provides feedback that can be used to refine and improve the AI model over time, enhancing its effectiveness and efficiency.\n", + "7. **Continuous Improvement**: Ongoing evaluation provides insights that can be used to improve the agent over time, optimizing performance and adapting to new challenges.\n", "\n", - "8. **Integration and Compatibility**: Evaluates how well the AI integrates with existing systems and processes, ensuring compatibility and smooth operation.\n", + "8. **Resource Efficiency**: Evaluating helps ensure that the AI agent uses resources effectively, which can reduce costs and improve scalability.\n", "\n", - "9. **Resource Optimization**: Assesses the efficiency of the AI in terms of computational resources, which can lead to cost savings and improved performance.\n", - "\n", - "Evaluating AI agents systematically and rigorously supports their development and deployment in a responsible and effective manner.\n" + "In summary, evaluation is essential to ensure AI agents are effective, ethical, and aligned with user needs and societal norms.\n" ] } ], @@ -241,7 +237,9 @@ " print(result.final_output)\n", "\n", "loop = asyncio.get_running_loop()\n", - "await loop.create_task(main())" + "await loop.create_task(main())\n", + "\n", + "langfuse.flush()" ] }, { @@ -272,7 +270,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -285,11 +283,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "13:33:30.839 OpenAI Agents trace: Agent workflow\n", - "13:33:30.840 Agent run: 'Hello world'\n", - "13:33:30.842 Responses API with 'gpt-4o'\n", - "13:33:31.822 Function: get_weather\n", - "13:33:31.825 Responses API with 'gpt-4o'\n", + "13:01:15.351 OpenAI Agents trace: Agent workflow\n", + "13:01:15.355 Agent run: 'Hello world'\n", + "13:01:15.364 Responses API with 'gpt-4o'\n", + "13:01:15.999 Function: get_weather\n", + "13:01:16.000 Responses API with 'gpt-4o'\n", "The weather in Berlin is currently sunny.\n" ] } @@ -396,37 +394,14 @@ "source": [ "#### 3. Additional Attributes\n", "\n", - "Opentelemetry lets you attach a set of attributes to all spans by setting [`set_attribute`](https://opentelemetry.io/docs/languages/python/instrumentation/#add-attributes-to-a-span). This allows you to set properties like a Langfuse Session ID, to group traces into Langfuse Sessions or a User ID, to assign traces to a specific user. You can find a list of all supported attributes in the [here](/docs/opentelemetry/get-started#property-mapping).\n", - "\n", - "In this example, we pass a [user_id](https://langfuse.com/docs/tracing-features/users), [session_id](https://langfuse.com/docs/tracing-features/sessions) and [trace_tags](https://langfuse.com/docs/tracing-features/tags) to Langfuse. You can also use the span attribute `input.value` and `output.value` to set the trace level input and output." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "CaSQNrgyfoSR" - }, - "outputs": [], - "source": [ - "from opentelemetry.sdk.trace import TracerProvider\n", - "from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter\n", - "from opentelemetry.sdk.trace.export import SimpleSpanProcessor\n", - "\n", - "trace_provider = TracerProvider()\n", - "trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))\n", + "Langfuse allows you to pass additional attributes to your spans. These can include `user_id`, `tags`, `session_id`, and custom `metadata`. Enriching traces with these details is important for analysis, debugging, and monitoring of your application's behavior across different users or sessions.\n", "\n", - "# Sets the global default tracer provider\n", - "from opentelemetry import trace\n", - "trace.set_tracer_provider(trace_provider)\n", - "\n", - "# Creates a tracer from the global tracer provider\n", - "tracer = trace.get_tracer(__name__)" + "In this example, we pass a [user_id](https://langfuse.com/docs/tracing-features/users), [session_id](https://langfuse.com/docs/tracing-features/sessions) and [trace_tags](https://langfuse.com/docs/tracing-features/tags) to Langfuse. " ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -439,52 +414,62 @@ "name": "stdout", "output_type": "stream", "text": [ - "13:34:49.654 OpenAI Agents trace: Agent workflow\n", - "13:34:49.655 Agent run: 'Assistant'\n", - "13:34:49.657 Responses API with 'gpt-4o'\n", + "13:02:41.552 OpenAI Agents trace: Agent workflow\n", + "13:02:41.553 Agent run: 'Assistant'\n", + "13:02:41.554 Responses API with 'gpt-4o'\n", "AI agent evaluation is crucial for several reasons:\n", "\n", - "1. **Performance Verification**: It ensures that the AI agent performs its intended tasks effectively and meets specific criteria or benchmarks.\n", + "1. **Performance Metrics**: It helps determine how well an AI agent performs its tasks, ensuring it meets the desired standards and objectives.\n", "\n", - "2. **Safety and Reliability**: Evaluation helps identify and mitigate risks, ensuring that the AI operates safely and reliably in real-world situations.\n", + "2. **Reliability and Safety**: Evaluation ensures the agent behaves consistently and safely in different scenarios, reducing risks of unintended consequences.\n", "\n", - "3. **Continuous Improvement**: Analyzing performance data allows developers to refine and enhance the AI, leading to better outcomes and more efficient systems.\n", + "3. **Bias Detection**: By evaluating AI agents, developers can identify and mitigate biases, ensuring fair and equitable outcomes for all users.\n", "\n", - "4. **Transparency and Accountability**: Thorough evaluation provides transparency into how decisions are made by the AI, which is essential for accountability, especially in sensitive applications.\n", + "4. **Benchmarking and Comparison**: Evaluation allows for the comparison of different AI models or versions, facilitating improvements and advancements.\n", "\n", - "5. **Bias and Fairness**: Evaluating AI systems helps detect and address potential biases, ensuring fair treatment of all users and stakeholders.\n", + "5. **User Trust**: Demonstrating the effectiveness and reliability of an AI agent builds trust with users, encouraging adoption and usage.\n", "\n", - "6. 
**Compliance**: It ensures adherence to regulations and industry standards, which is critical for legal and ethical compliance.\n", + "6. **Regulatory Compliance**: Proper evaluation helps ensure AI systems meet legal and regulatory requirements, which is especially important in sensitive domains like healthcare or finance.\n", "\n", - "7. **User Trust**: A well-evaluated AI fosters trust among users, stakeholders, and the public, as they can be confident in its capabilities and limitations.\n", + "7. **Scalability and Deployment**: Evaluation helps determine if an AI agent can scale effectively and function accurately in real-world environments.\n", "\n", - "8. **Resource Allocation**: Evaluation helps determine if the AI is using resources efficiently, which can be crucial for cost management and scalability.\n" + "Overall, AI agent evaluation is key to developing effective, trustworthy, and ethical AI systems.\n" ] } ], "source": [ "input_query = \"Why is AI agent evaluation important?\"\n", "\n", - "with tracer.start_as_current_span(\"OpenAI-Agent-Trace\") as span:\n", - " span.set_attribute(\"langfuse.user.id\", \"user-12345\")\n", - " span.set_attribute(\"langfuse.session.id\", \"my-agent-session\")\n", - " span.set_attribute(\"langfuse.tags\", [\"staging\", \"demo\", \"OpenAI Agent SDK\"])\n", - "\n", + "with langfuse.start_as_current_span(\n", + " name=\"OpenAI-Agent-Trace\",\n", + " ) as span:\n", + " \n", + " # Run your application here\n", " async def main(input_query):\n", - " agent = Agent(\n", - " name = \"Assistant\",\n", - " instructions = \"You are a helpful assistant.\",\n", - " )\n", + " agent = Agent(\n", + " name = \"Assistant\",\n", + " instructions = \"You are a helpful assistant.\",\n", + " )\n", "\n", - " result = await Runner.run(agent, input_query)\n", - " print(result.final_output)\n", - " return result\n", + " result = await Runner.run(agent, input_query)\n", + " print(result.final_output)\n", + " return result\n", "\n", " result = await main(input_query)\n", - "\n", - " # Add input and output values to parent trace\n", - " span.set_attribute(\"input.value\", input_query)\n", - " span.set_attribute(\"output.value\", result.final_output)" + " \n", + " # Pass additional attributes to the span\n", + " span.update_trace(\n", + " input=input_query,\n", + " output=result,\n", + " user_id=\"user_123\",\n", + " session_id=\"my-agent-session\",\n", + " tags=[\"staging\", \"demo\", \"OpenAI Agent SDK\"],\n", + " metadata={\"email\": \"user@langfuse.com\"},\n", + " version=\"1.0.0\"\n", + " )\n", + " \n", + "# Flush events in short-lived applications\n", + "langfuse.flush()" ] }, { @@ -586,9 +571,9 @@ "from opentelemetry.trace import format_trace_id\n", "import ipywidgets as widgets\n", "from IPython.display import display\n", - "from langfuse import Langfuse\n", - "\n", - "langfuse = Langfuse()\n", + "from langfuse import get_client\n", + " \n", + "langfuse = get_client()\n", "\n", "# Define your agent with the web search tool\n", "agent = Agent(\n", @@ -597,39 +582,41 @@ " tools=[WebSearchTool()]\n", ")\n", "\n", - "formatted_trace_id = None # We'll store the current trace_id globally for demonstration\n", - "\n", "def on_feedback(button):\n", " if button.icon == \"thumbs-up\":\n", - " langfuse.score(\n", + " langfuse.create_score(\n", " value=1,\n", " name=\"user-feedback\",\n", " comment=\"The user gave this response a thumbs up\",\n", - " trace_id=formatted_trace_id\n", + " trace_id=trace_id\n", " )\n", " elif button.icon == \"thumbs-down\":\n", - " 
langfuse.score(\n", + "        langfuse.create_score(\n", "            value=0,\n", "            name=\"user-feedback\",\n", "            comment=\"The user gave this response a thumbs down\",\n", - "            trace_id=formatted_trace_id\n", + "            trace_id=trace_id\n", "        )\n", "    print(\"Scored the trace in Langfuse\")\n", "\n", "user_input = input(\"Enter your question: \")\n", "\n", "# Run agent\n", - "with trace.get_tracer(__name__).start_as_current_span(\"OpenAI-Agent-Trace\") as span:\n", - "\n", - "    # Run your agent with a query\n", + "with langfuse.start_as_current_span(\n", + "    name=\"OpenAI-Agent-Trace\",\n", + "    ) as span:\n", + "    \n", + "    # Run your application here\n", "    result = Runner.run_sync(agent, user_input)\n", "    print(result.final_output)\n", "\n", - "    current_span = trace.get_current_span()\n", - "    span_context = current_span.get_span_context()\n", - "    trace_id = span_context.trace_id\n", - "    formatted_trace_id = str(format_trace_id(trace_id))\n", - "    langfuse.trace(id=formatted_trace_id, input=user_input, output=result.final_output)\n", + "    # Capture the trace ID so the feedback buttons can score this trace\n", + "    trace_id = langfuse.get_current_trace_id()\n", + "\n", + "    span.update_trace(\n", + "        input=user_input,\n", + "        output=result.final_output,\n", + "    )\n", "\n", "# Get feedback\n", "print(\"How did you like the agent response?\")\n", @@ -640,7 +627,10 @@ "thumbs_up.on_click(on_feedback)\n", "thumbs_down.on_click(on_feedback)\n", "\n", - "display(widgets.HBox([thumbs_up, thumbs_down]))" + "display(widgets.HBox([thumbs_up, thumbs_down]))\n", + "\n", + "# Flush events in short-lived applications\n", + "langfuse.flush()" ] }, { @@ -711,13 +701,15 @@ "input_query = \"Is eating carrots good for the eyes?\"\n", "\n", "# Run agent\n", - "with trace.get_tracer(__name__).start_as_current_span(\"OpenAI-Agent-Trace\") as span:\n", + "with langfuse.start_as_current_span(name=\"OpenAI-Agent-Trace\") as span:\n", "    # Run your agent with a query\n", "    result = Runner.run_sync(agent, input_query)\n", "\n", "    # Add input and output values to parent trace\n", - "    span.set_attribute(\"input.value\", input_query)\n", - "    span.set_attribute(\"output.value\", result.final_output)" + "    span.update_trace(\n", + "        input=input_query,\n", + "        output=result.final_output,\n", + "    )" ] }, { @@ -956,8 +948,8 @@ } ], "source": [ - "from langfuse import Langfuse\n", - "langfuse = Langfuse()\n", + "from langfuse import get_client\n", + "langfuse = get_client()\n", "\n", "langfuse_dataset_name = \"search-dataset_huggingface_openai-agent\"\n", "\n", @@ -1008,7 +1000,7 @@ "#### Running the Agent on the Dataset\n", "\n", "We define a helper function `run_openai_agent()` that:\n", - "1. Starts an OpenTelemetry span\n", + "1. Starts a Langfuse span\n", "2. Runs our agent on the prompt\n", "3. 
Records the trace ID in Langfuse\n", "\n", @@ -1017,78 +1009,53 @@ }, { "cell_type": "code", - "execution_count": 60, - "metadata": { - "id": "-rYh1PBRfoSS" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from agents import Agent, Runner, WebSearchTool\n", - "from opentelemetry.trace import format_trace_id\n", + "from langfuse import get_client\n", + " \n", + "langfuse = get_client()\n", + "dataset_name = \"search-dataset_huggingface_openai-agent\"\n", + "current_run_name = \"qna_model_v3_run_05_20\" # Identifies this specific evaluation run\n", "\n", - "# Define your agent with the web search tool\n", "agent = Agent(\n", "    name=\"WebSearchAgent\",\n", "    instructions=\"You are an agent that can search the web.\",\n", "    tools=[WebSearchTool(search_context_size= \"high\")]\n", ")\n", - "\n", + " \n", + "# Instrumented application function: runs the agent inside a Langfuse generation\n", "def run_openai_agent(question):\n", - "    with tracer.start_as_current_span(\"OpenAI-Agent-Trace\") as span:\n", - "        span.set_attribute(\"langfuse.tag\", \"dataset-run\")\n", - "\n", - "        # Run your agent with a query\n", + "    with langfuse.start_as_current_generation(name=\"qna-llm-call\") as generation:\n", + "        # Run the agent on the question\n", "        result = Runner.run_sync(agent, question)\n", - "\n", - "        # Get the Langfuse trace_id to link the dataset run item to the agent trace\n", - "        current_span = trace.get_current_span()\n", - "        span_context = current_span.get_span_context()\n", - "        trace_id = span_context.trace_id\n", - "        formatted_trace_id = format_trace_id(trace_id)\n", - "\n", - "        langfuse_trace = langfuse.trace(\n", - "            id=formatted_trace_id,\n", - "            input=question,\n", - "            output=result.final_output\n", + " \n", + "        # Update the trace with the input and output\n", + "        generation.update_trace(\n", + "            input=question,\n", + "            output=result.final_output,\n", "        )\n", - "        return langfuse_trace, result.final_output" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "collapsed": true, - "id": "nF6JLCsYfoST", - "outputId": "84c3b74e-1aa3-4ef2-f285-f390095b03c0" - }, - "outputs": [], - "source": [ - "dataset = langfuse.get_dataset(langfuse_dataset_name)\n", "\n", - "# Run our agent against each dataset item\n", + "        return result.final_output\n", + " \n", + "dataset = langfuse.get_dataset(name=dataset_name) # Fetch your pre-populated dataset\n", + " \n", "for item in dataset.items:\n", - "    langfuse_trace, output = run_openai_agent(item.input[\"text\"])\n", - "\n", - "    # Link the trace to the dataset item for analysis\n", - "    item.link(\n", - "        langfuse_trace,\n", - "        run_name=\"openai-agent-run-03\",\n", - "        run_metadata={ \"search_context_size\": \"high\"}\n", - "    )\n", - "\n", - "    # Optionally, store a quick evaluation score for demonstration\n", - "    langfuse_trace.score(\n", - "        name=\"\",\n", - "        value=1,\n", - "        comment=\"This is a comment\"\n", - "    )\n", - "\n", - "# Flush data to ensure all telemetry is sent\n", - "langfuse.flush()" + " \n", + "    # Use the item.run() context manager\n", + "    with item.run(\n", + "        run_name=current_run_name,\n", + "        run_metadata={\"model_provider\": \"OpenAI\", \"temperature_setting\": 0.7},\n", + "        run_description=\"Evaluation run for Q&A model v3 on May 20th\"\n", + "    ) as root_span: # root_span is the root span of the new trace for this item and run.\n", + "        # All subsequent langfuse operations within this block are part of this trace.\n", + "        \n", + "        # Call your 
application logic\n", + " generated_answer = run_openai_agent(question=item.input[\"text\"])\n", + "\n", + " print(item.input)" ] }, { @@ -1114,7 +1081,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": ".venv", "language": "python", "name": "python3" },
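Note on the updated dataset-run loop: it links each trace to the dataset item via `item.run()`, but unlike the removed v2 code it no longer attaches an evaluation score to the run. Below is a minimal sketch of how a score could be recorded inside the `item.run()` block, reusing only the v3 APIs already used in this notebook (`item.run()`, `langfuse.create_score()`, `langfuse.get_current_trace_id()`, `langfuse.flush()`) plus the `run_openai_agent()` helper defined above. The run name, the `simple_correctness_score` helper, and the assumed `{"text": ...}` shape of `expected_output` are illustrative, not part of the patch.

```python
from langfuse import get_client

langfuse = get_client()
dataset = langfuse.get_dataset(name="search-dataset_huggingface_openai-agent")

def simple_correctness_score(answer: str, expected: str) -> float:
    # Hypothetical placeholder evaluator: substring match against the expected answer.
    # In practice you would likely use an LLM-as-a-judge or another metric.
    return 1.0 if expected.strip().lower() in answer.lower() else 0.0

for item in dataset.items:
    with item.run(run_name="qna_model_v3_run_05_20_scored") as root_span:
        # run_openai_agent() (defined above) already updates the trace input/output.
        answer = run_openai_agent(question=item.input["text"])

        # Assumed expected_output shape; adjust to how the dataset items were created.
        expected = item.expected_output
        expected_text = expected["text"] if isinstance(expected, dict) else str(expected)

        # Attach a score to the trace created for this dataset item and run.
        langfuse.create_score(
            name="correctness",
            value=simple_correctness_score(answer, expected_text),
            trace_id=langfuse.get_current_trace_id(),
            comment="Heuristic substring match against the expected output",
        )

langfuse.flush()
```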