From db3c2c4e1c83ac38daddda8c921757741c53a670 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jannik=20Maierh=C3=B6fer?= Date: Tue, 17 Jun 2025 15:08:50 +0200 Subject: [PATCH] Fix: update notebook to langfuse sdk v3 --- examples/agents_sdk/evaluate_agents.ipynb | 339 ++++++++++------------ 1 file changed, 153 insertions(+), 186 deletions(-) diff --git a/examples/agents_sdk/evaluate_agents.ipynb b/examples/agents_sdk/evaluate_agents.ipynb index 9f4c9861aa..26d0fd17bd 100644 --- a/examples/agents_sdk/evaluate_agents.ipynb +++ b/examples/agents_sdk/evaluate_agents.ipynb @@ -45,7 +45,7 @@ "source": [ "## Step 0: Install the Required Libraries\n", "\n", - "Below we install the `openai-agents` library (the OpenAI Agents SDK [link text](https://github.com/openai/openai-agents-python)), the `pydantic-ai[logfire]` OpenTelemetry instrumentation, `langfuse` and the Hugging Face `datasets` library" + "Below we install the `openai-agents` library (the [OpenAI Agents SDK](https://github.com/openai/openai-agents-python)), the `pydantic-ai[logfire]` OpenTelemetry instrumentation, `langfuse` and the Hugging Face `datasets` library" ] }, { @@ -61,11 +61,7 @@ }, "outputs": [], "source": [ - "%pip install openai-agents\n", - "%pip install nest_asyncio\n", - "%pip install pydantic-ai[logfire]\n", - "%pip install langfuse\n", - "%pip install datasets" + "%pip install openai-agents nest_asyncio \"pydantic-ai[logfire]\" langfuse datasets" ] }, { @@ -93,44 +89,46 @@ "import base64\n", "\n", "# Get keys for your project from the project settings page: https://cloud.langfuse.com\n", - "os.environ[\"LANGFUSE_PUBLIC_KEY\"] = \"pk-lf-...\"\n", - "os.environ[\"LANGFUSE_SECRET_KEY\"] = \"sk-lf-...\"\n", + "os.environ[\"LANGFUSE_PUBLIC_KEY\"] = \"pk-lf-...\" \n", + "os.environ[\"LANGFUSE_SECRET_KEY\"] = \"sk-lf-...\" \n", "os.environ[\"LANGFUSE_HOST\"] = \"https://cloud.langfuse.com\" # 🇪🇺 EU region\n", "# os.environ[\"LANGFUSE_HOST\"] = \"https://us.cloud.langfuse.com\" # 🇺🇸 US region\n", "\n", + "# Build Basic Auth header.\n", "LANGFUSE_AUTH = base64.b64encode(\n", " f\"{os.environ.get('LANGFUSE_PUBLIC_KEY')}:{os.environ.get('LANGFUSE_SECRET_KEY')}\".encode()\n", ").decode()\n", - "\n", + " \n", + "# Configure OpenTelemetry endpoint & headers\n", "os.environ[\"OTEL_EXPORTER_OTLP_ENDPOINT\"] = os.environ.get(\"LANGFUSE_HOST\") + \"/api/public/otel\"\n", "os.environ[\"OTEL_EXPORTER_OTLP_HEADERS\"] = f\"Authorization=Basic {LANGFUSE_AUTH}\"\n", "\n", - "# Set your OpenAI API Key\n", + "# Your openai key\n", "os.environ[\"OPENAI_API_KEY\"] = \"sk-proj-...\"" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the environment variables set, we can now initialize the Langfuse client. `get_client()` initializes the Langfuse client using the credentials provided in the environment variables." 
+ ] + }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "KQjuJNHkfoSP" - }, + "metadata": {}, "outputs": [], "source": [ - "from opentelemetry.sdk.trace import TracerProvider\n", - "from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter\n", - "from opentelemetry.sdk.trace.export import SimpleSpanProcessor\n", - "\n", - "# Create a TracerProvider for OpenTelemetry\n", - "trace_provider = TracerProvider()\n", - "\n", - "# Add a SimpleSpanProcessor with the OTLPSpanExporter to send traces\n", - "trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))\n", - "\n", - "# Set the global default tracer provider\n", - "from opentelemetry import trace\n", - "trace.set_tracer_provider(trace_provider)\n", - "tracer = trace.get_tracer(__name__)" + "from langfuse import get_client\n", + " \n", + "langfuse = get_client()\n", + " \n", + "# Verify connection\n", + "if langfuse.auth_check():\n", + " print(\"Langfuse client is authenticated and ready!\")\n", + "else:\n", + " print(\"Authentication failed. Please check your credentials and host.\")" ] }, { @@ -144,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": { "id": "td11AsCShBxA" }, @@ -187,7 +185,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -200,30 +198,28 @@ "name": "stdout", "output_type": "stream", "text": [ - "12:01:03.401 OpenAI Agents trace: Agent workflow\n", - "12:01:03.403 Agent run: 'Assistant'\n", - "12:01:03.404 Responses API with 'gpt-4o'\n", + "13:00:52.784 OpenAI Agents trace: Agent workflow\n", + "13:00:52.787 Agent run: 'Assistant'\n", + "13:00:52.797 Responses API with 'gpt-4o'\n", "Evaluating AI agents is crucial for several reasons:\n", "\n", - "1. **Performance Verification**: Ensures that the AI performs its intended tasks accurately and efficiently, meeting the desired objectives and criteria.\n", + "1. **Performance Assessment**: It helps determine if the agent meets the desired goals and performs tasks effectively. By evaluating, we can assess accuracy, speed, and overall performance.\n", "\n", - "2. **Reliability and Consistency**: Assesses whether the AI provides consistent results across different scenarios and over time.\n", + "2. **Reliability and Consistency**: Regular evaluation ensures that the AI behaves consistently under different conditions and is reliable in production environments.\n", "\n", - "3. **Safety and Risk Management**: Identifies potential risks or harmful behaviors that could lead to undesirable outcomes, ensuring the AI operates safely within defined limits.\n", + "3. **Bias and Fairness**: Identifying and mitigating biases is essential for fair and ethical AI. Evaluation helps uncover any discriminatory patterns in the agent's behavior.\n", "\n", - "4. **Bias and Fairness**: Checks for any biases in the AI’s decision-making process to promote fairness and avoid discrimination against particular groups.\n", + "4. **Safety**: Evaluating AI agents ensures they operate safely and do not cause harm or unintended side effects, especially in critical applications.\n", "\n", - "5. **User Trust and Adoption**: Builds confidence and trust in the AI system among users and stakeholders, which is essential for widespread adoption.\n", + "5. 
**User Trust**: Proper evaluation builds trust with users and stakeholders by demonstrating that the AI is effective and aligned with expectations.\n", "\n", - "6. **Regulatory Compliance**: Ensures that the AI adheres to relevant laws, regulations, and ethical guidelines, which may vary by industry or region.\n", + "6. **Regulatory Compliance**: It ensures adherence to legal and ethical standards, which is increasingly important as regulations around AI evolve.\n", "\n", - "7. **Continuous Improvement**: Provides feedback that can be used to refine and improve the AI model over time, enhancing its effectiveness and efficiency.\n", + "7. **Continuous Improvement**: Ongoing evaluation provides insights that can be used to improve the agent over time, optimizing performance and adapting to new challenges.\n", "\n", - "8. **Integration and Compatibility**: Evaluates how well the AI integrates with existing systems and processes, ensuring compatibility and smooth operation.\n", + "8. **Resource Efficiency**: Evaluating helps ensure that the AI agent uses resources effectively, which can reduce costs and improve scalability.\n", "\n", - "9. **Resource Optimization**: Assesses the efficiency of the AI in terms of computational resources, which can lead to cost savings and improved performance.\n", - "\n", - "Evaluating AI agents systematically and rigorously supports their development and deployment in a responsible and effective manner.\n" + "In summary, evaluation is essential to ensure AI agents are effective, ethical, and aligned with user needs and societal norms.\n" ] } ], @@ -241,7 +237,9 @@ " print(result.final_output)\n", "\n", "loop = asyncio.get_running_loop()\n", - "await loop.create_task(main())" + "await loop.create_task(main())\n", + "\n", + "langfuse.flush()" ] }, { @@ -272,7 +270,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -285,11 +283,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "13:33:30.839 OpenAI Agents trace: Agent workflow\n", - "13:33:30.840 Agent run: 'Hello world'\n", - "13:33:30.842 Responses API with 'gpt-4o'\n", - "13:33:31.822 Function: get_weather\n", - "13:33:31.825 Responses API with 'gpt-4o'\n", + "13:01:15.351 OpenAI Agents trace: Agent workflow\n", + "13:01:15.355 Agent run: 'Hello world'\n", + "13:01:15.364 Responses API with 'gpt-4o'\n", + "13:01:15.999 Function: get_weather\n", + "13:01:16.000 Responses API with 'gpt-4o'\n", "The weather in Berlin is currently sunny.\n" ] } @@ -396,37 +394,14 @@ "source": [ "#### 3. Additional Attributes\n", "\n", - "Opentelemetry lets you attach a set of attributes to all spans by setting [`set_attribute`](https://opentelemetry.io/docs/languages/python/instrumentation/#add-attributes-to-a-span). This allows you to set properties like a Langfuse Session ID, to group traces into Langfuse Sessions or a User ID, to assign traces to a specific user. You can find a list of all supported attributes in the [here](/docs/opentelemetry/get-started#property-mapping).\n", - "\n", - "In this example, we pass a [user_id](https://langfuse.com/docs/tracing-features/users), [session_id](https://langfuse.com/docs/tracing-features/sessions) and [trace_tags](https://langfuse.com/docs/tracing-features/tags) to Langfuse. You can also use the span attribute `input.value` and `output.value` to set the trace level input and output." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "CaSQNrgyfoSR" - }, - "outputs": [], - "source": [ - "from opentelemetry.sdk.trace import TracerProvider\n", - "from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter\n", - "from opentelemetry.sdk.trace.export import SimpleSpanProcessor\n", - "\n", - "trace_provider = TracerProvider()\n", - "trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))\n", + "Langfuse allows you to pass additional attributes to your spans. These can include `user_id`, `tags`, `session_id`, and custom `metadata`. Enriching traces with these details is important for analysis, debugging, and monitoring of your application's behavior across different users or sessions.\n", "\n", - "# Sets the global default tracer provider\n", - "from opentelemetry import trace\n", - "trace.set_tracer_provider(trace_provider)\n", - "\n", - "# Creates a tracer from the global tracer provider\n", - "tracer = trace.get_tracer(__name__)" + "In this example, we pass a [user_id](https://langfuse.com/docs/tracing-features/users), [session_id](https://langfuse.com/docs/tracing-features/sessions) and [trace_tags](https://langfuse.com/docs/tracing-features/tags) to Langfuse. " ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -439,52 +414,62 @@ "name": "stdout", "output_type": "stream", "text": [ - "13:34:49.654 OpenAI Agents trace: Agent workflow\n", - "13:34:49.655 Agent run: 'Assistant'\n", - "13:34:49.657 Responses API with 'gpt-4o'\n", + "13:02:41.552 OpenAI Agents trace: Agent workflow\n", + "13:02:41.553 Agent run: 'Assistant'\n", + "13:02:41.554 Responses API with 'gpt-4o'\n", "AI agent evaluation is crucial for several reasons:\n", "\n", - "1. **Performance Verification**: It ensures that the AI agent performs its intended tasks effectively and meets specific criteria or benchmarks.\n", + "1. **Performance Metrics**: It helps determine how well an AI agent performs its tasks, ensuring it meets the desired standards and objectives.\n", "\n", - "2. **Safety and Reliability**: Evaluation helps identify and mitigate risks, ensuring that the AI operates safely and reliably in real-world situations.\n", + "2. **Reliability and Safety**: Evaluation ensures the agent behaves consistently and safely in different scenarios, reducing risks of unintended consequences.\n", "\n", - "3. **Continuous Improvement**: Analyzing performance data allows developers to refine and enhance the AI, leading to better outcomes and more efficient systems.\n", + "3. **Bias Detection**: By evaluating AI agents, developers can identify and mitigate biases, ensuring fair and equitable outcomes for all users.\n", "\n", - "4. **Transparency and Accountability**: Thorough evaluation provides transparency into how decisions are made by the AI, which is essential for accountability, especially in sensitive applications.\n", + "4. **Benchmarking and Comparison**: Evaluation allows for the comparison of different AI models or versions, facilitating improvements and advancements.\n", "\n", - "5. **Bias and Fairness**: Evaluating AI systems helps detect and address potential biases, ensuring fair treatment of all users and stakeholders.\n", + "5. **User Trust**: Demonstrating the effectiveness and reliability of an AI agent builds trust with users, encouraging adoption and usage.\n", "\n", - "6. 
**Compliance**: It ensures adherence to regulations and industry standards, which is critical for legal and ethical compliance.\n", + "6. **Regulatory Compliance**: Proper evaluation helps ensure AI systems meet legal and regulatory requirements, which is especially important in sensitive domains like healthcare or finance.\n", "\n", - "7. **User Trust**: A well-evaluated AI fosters trust among users, stakeholders, and the public, as they can be confident in its capabilities and limitations.\n", + "7. **Scalability and Deployment**: Evaluation helps determine if an AI agent can scale effectively and function accurately in real-world environments.\n", "\n", - "8. **Resource Allocation**: Evaluation helps determine if the AI is using resources efficiently, which can be crucial for cost management and scalability.\n" + "Overall, AI agent evaluation is key to developing effective, trustworthy, and ethical AI systems.\n" ] } ], "source": [ "input_query = \"Why is AI agent evaluation important?\"\n", "\n", - "with tracer.start_as_current_span(\"OpenAI-Agent-Trace\") as span:\n", - " span.set_attribute(\"langfuse.user.id\", \"user-12345\")\n", - " span.set_attribute(\"langfuse.session.id\", \"my-agent-session\")\n", - " span.set_attribute(\"langfuse.tags\", [\"staging\", \"demo\", \"OpenAI Agent SDK\"])\n", - "\n", + "with langfuse.start_as_current_span(\n", + " name=\"OpenAI-Agent-Trace\",\n", + " ) as span:\n", + " \n", + " # Run your application here\n", " async def main(input_query):\n", - " agent = Agent(\n", - " name = \"Assistant\",\n", - " instructions = \"You are a helpful assistant.\",\n", - " )\n", + " agent = Agent(\n", + " name = \"Assistant\",\n", + " instructions = \"You are a helpful assistant.\",\n", + " )\n", "\n", - " result = await Runner.run(agent, input_query)\n", - " print(result.final_output)\n", - " return result\n", + " result = await Runner.run(agent, input_query)\n", + " print(result.final_output)\n", + " return result\n", "\n", " result = await main(input_query)\n", - "\n", - " # Add input and output values to parent trace\n", - " span.set_attribute(\"input.value\", input_query)\n", - " span.set_attribute(\"output.value\", result.final_output)" + " \n", + " # Pass additional attributes to the span\n", + " span.update_trace(\n", + " input=input_query,\n", + " output=result,\n", + " user_id=\"user_123\",\n", + " session_id=\"my-agent-session\",\n", + " tags=[\"staging\", \"demo\", \"OpenAI Agent SDK\"],\n", + " metadata={\"email\": \"user@langfuse.com\"},\n", + " version=\"1.0.0\"\n", + " )\n", + " \n", + "# Flush events in short-lived applications\n", + "langfuse.flush()" ] }, { @@ -586,9 +571,9 @@ "from opentelemetry.trace import format_trace_id\n", "import ipywidgets as widgets\n", "from IPython.display import display\n", - "from langfuse import Langfuse\n", - "\n", - "langfuse = Langfuse()\n", + "from langfuse import get_client\n", + " \n", + "langfuse = get_client()\n", "\n", "# Define your agent with the web search tool\n", "agent = Agent(\n", @@ -597,39 +582,41 @@ " tools=[WebSearchTool()]\n", ")\n", "\n", - "formatted_trace_id = None # We'll store the current trace_id globally for demonstration\n", - "\n", "def on_feedback(button):\n", " if button.icon == \"thumbs-up\":\n", - " langfuse.score(\n", + " langfuse.create_score(\n", " value=1,\n", " name=\"user-feedback\",\n", " comment=\"The user gave this response a thumbs up\",\n", - " trace_id=formatted_trace_id\n", + " trace_id=trace_id\n", " )\n", " elif button.icon == \"thumbs-down\":\n", - " 
langfuse.score(\n", + "        langfuse.create_score(\n", "            value=0,\n", "            name=\"user-feedback\",\n", "            comment=\"The user gave this response a thumbs down\",\n", - "            trace_id=formatted_trace_id\n", + "            trace_id=trace_id\n", "        )\n", "    print(\"Scored the trace in Langfuse\")\n", "\n", "user_input = input(\"Enter your question: \")\n", "\n", "# Run agent\n", - "with trace.get_tracer(__name__).start_as_current_span(\"OpenAI-Agent-Trace\") as span:\n", - "\n", - "    # Run your agent with a query\n", + "with langfuse.start_as_current_span(\n", + "    name=\"OpenAI-Agent-Trace\",\n", + "    ) as span:\n", + "    \n", + "    # Run your application here\n", "    result = Runner.run_sync(agent, user_input)\n", "    print(result.final_output)\n", "\n", - "    current_span = trace.get_current_span()\n", - "    span_context = current_span.get_span_context()\n", - "    trace_id = span_context.trace_id\n", - "    formatted_trace_id = str(format_trace_id(trace_id))\n", - "    langfuse.trace(id=formatted_trace_id, input=user_input, output=result.final_output)\n", + "    # Capture the trace ID so the feedback buttons can score this trace\n", + "    trace_id = langfuse.get_current_trace_id()\n", + "\n", + "    span.update_trace(\n", + "        input=user_input,\n", + "        output=result.final_output,\n", + "    )\n", "\n", "# Get feedback\n", "print(\"How did you like the agent response?\")\n", @@ -640,7 +627,10 @@ "thumbs_up.on_click(on_feedback)\n", "thumbs_down.on_click(on_feedback)\n", "\n", - "display(widgets.HBox([thumbs_up, thumbs_down]))" + "display(widgets.HBox([thumbs_up, thumbs_down]))\n", + "\n", + "# Flush events in short-lived applications\n", + "langfuse.flush()" ] }, { @@ -711,13 +701,15 @@ "input_query = \"Is eating carrots good for the eyes?\"\n", "\n", "# Run agent\n", - "with trace.get_tracer(__name__).start_as_current_span(\"OpenAI-Agent-Trace\") as span:\n", + "with langfuse.start_as_current_span(name=\"OpenAI-Agent-Trace\") as span:\n", "    # Run your agent with a query\n", "    result = Runner.run_sync(agent, input_query)\n", "\n", "    # Add input and output values to parent trace\n", - "    span.set_attribute(\"input.value\", input_query)\n", - "    span.set_attribute(\"output.value\", result.final_output)" + "    span.update_trace(\n", + "        input=input_query,\n", + "        output=result.final_output,\n", + "    )" ] }, { @@ -956,8 +948,8 @@ } ], "source": [ - "from langfuse import Langfuse\n", - "langfuse = Langfuse()\n", + "from langfuse import get_client\n", + "langfuse = get_client()\n", "\n", "langfuse_dataset_name = \"search-dataset_huggingface_openai-agent\"\n", "\n", @@ -1008,7 +1000,7 @@ "#### Running the Agent on the Dataset\n", "\n", "We define a helper function `run_openai_agent()` that:\n", - "1. Starts an OpenTelemetry span\n", + "1. Starts a Langfuse span\n", "2. Runs our agent on the prompt\n", "3. 
Records the trace ID in Langfuse\n", "\n", @@ -1017,78 +1009,53 @@ }, { "cell_type": "code", - "execution_count": 60, - "metadata": { - "id": "-rYh1PBRfoSS" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from agents import Agent, Runner, WebSearchTool\n", - "from opentelemetry.trace import format_trace_id\n", + "from langfuse import get_client\n", + " \n", + "langfuse = get_client()\n", + "dataset_name = \"search-dataset_huggingface_openai-agent\"\n", + "current_run_name = \"qna_model_v3_run_05_20\" # Identifies this specific evaluation run\n", "\n", - "# Define your agent with the web search tool\n", "agent = Agent(\n", "    name=\"WebSearchAgent\",\n", "    instructions=\"You are an agent that can search the web.\",\n", "    tools=[WebSearchTool(search_context_size= \"high\")]\n", ")\n", - "\n", + " \n", + "# Instrumented application function: runs the agent inside a Langfuse generation\n", "def run_openai_agent(question):\n", - "    with tracer.start_as_current_span(\"OpenAI-Agent-Trace\") as span:\n", - "        span.set_attribute(\"langfuse.tag\", \"dataset-run\")\n", - "\n", - "        # Run your agent with a query\n", + "    with langfuse.start_as_current_generation(name=\"qna-llm-call\") as generation:\n", + "        # Run the agent on the question\n", "        result = Runner.run_sync(agent, question)\n", - "\n", - "        # Get the Langfuse trace_id to link the dataset run item to the agent trace\n", - "        current_span = trace.get_current_span()\n", - "        span_context = current_span.get_span_context()\n", - "        trace_id = span_context.trace_id\n", - "        formatted_trace_id = format_trace_id(trace_id)\n", - "\n", - "        langfuse_trace = langfuse.trace(\n", - "            id=formatted_trace_id,\n", - "            input=question,\n", - "            output=result.final_output\n", + " \n", + "        # Update the trace with the input and output\n", + "        generation.update_trace(\n", + "            input=question,\n", + "            output=result.final_output,\n", "        )\n", - "        return langfuse_trace, result.final_output" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "collapsed": true, - "id": "nF6JLCsYfoST", - "outputId": "84c3b74e-1aa3-4ef2-f285-f390095b03c0" - }, - "outputs": [], - "source": [ - "dataset = langfuse.get_dataset(langfuse_dataset_name)\n", "\n", - "# Run our agent against each dataset item\n", + "        return result.final_output\n", + " \n", + "dataset = langfuse.get_dataset(name=dataset_name) # Fetch your pre-populated dataset\n", + " \n", "for item in dataset.items:\n", - "    langfuse_trace, output = run_openai_agent(item.input[\"text\"])\n", - "\n", - "    # Link the trace to the dataset item for analysis\n", - "    item.link(\n", - "        langfuse_trace,\n", - "        run_name=\"openai-agent-run-03\",\n", - "        run_metadata={ \"search_context_size\": \"high\"}\n", - "    )\n", - "\n", - "    # Optionally, store a quick evaluation score for demonstration\n", - "    langfuse_trace.score(\n", - "        name=\"\",\n", - "        value=1,\n", - "        comment=\"This is a comment\"\n", - "    )\n", - "\n", - "# Flush data to ensure all telemetry is sent\n", - "langfuse.flush()" + " \n", + "    # Use the item.run() context manager\n", + "    with item.run(\n", + "        run_name=current_run_name,\n", + "        run_metadata={\"model_provider\": \"OpenAI\", \"temperature_setting\": 0.7},\n", + "        run_description=\"Evaluation run for Q&A model v3 on May 20th\"\n", + "    ) as root_span: # root_span is the root span of the new trace for this item and run.\n", + "        # All subsequent langfuse operations within this block are part of this trace.\n", + "        \n", + "        # Call your 
application logic\n", + " generated_answer = run_openai_agent(question=item.input[\"text\"])\n", + "\n", + " print(item.input)" ] }, { @@ -1114,7 +1081,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": ".venv", "language": "python", "name": "python3" },
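Note on the updated dataset-run loop: it links each trace to the dataset item via `item.run()`, but unlike the removed v2 code it no longer attaches an evaluation score to the run. Below is a minimal sketch of how a score could be recorded inside the `item.run()` block, reusing only the v3 APIs already used in this notebook (`item.run()`, `langfuse.create_score()`, `langfuse.get_current_trace_id()`, `langfuse.flush()`) plus the `run_openai_agent()` helper defined above. The run name, the `simple_correctness_score` helper, and the assumed `{"text": ...}` shape of `expected_output` are illustrative, not part of the patch.

```python
from langfuse import get_client

langfuse = get_client()
dataset = langfuse.get_dataset(name="search-dataset_huggingface_openai-agent")

def simple_correctness_score(answer: str, expected: str) -> float:
    # Hypothetical placeholder evaluator: substring match against the expected answer.
    # In practice you would likely use an LLM-as-a-judge or another metric.
    return 1.0 if expected.strip().lower() in answer.lower() else 0.0

for item in dataset.items:
    with item.run(run_name="qna_model_v3_run_05_20_scored") as root_span:
        # run_openai_agent() (defined above) already updates the trace input/output.
        answer = run_openai_agent(question=item.input["text"])

        # Assumed expected_output shape; adjust to how the dataset items were created.
        expected = item.expected_output
        expected_text = expected["text"] if isinstance(expected, dict) else str(expected)

        # Attach a score to the trace created for this dataset item and run.
        langfuse.create_score(
            name="correctness",
            value=simple_correctness_score(answer, expected_text),
            trace_id=langfuse.get_current_trace_id(),
            comment="Heuristic substring match against the expected output",
        )

langfuse.flush()
```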