From b6c049158dfe5805c790774edaa0c692b79d2489 Mon Sep 17 00:00:00 2001 From: Madhu Nunna Date: Mon, 9 Mar 2026 09:30:38 -0700 Subject: [PATCH 1/7] fix: remove uninitialized details_placeholder reference in app_streaming.py In app_streaming.py, details_placeholder was checked in session state but never initialized, causing AttributeError on newer Streamlit versions. Unlike app.py which properly initializes it, app_streaming.py has no use for it. Removed the dead code block. Fixes #140 --- 04-UX-demos/01-streamlit-template/docker_app/app_streaming.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/04-UX-demos/01-streamlit-template/docker_app/app_streaming.py b/04-UX-demos/01-streamlit-template/docker_app/app_streaming.py index b6cd1fc3..75265000 100644 --- a/04-UX-demos/01-streamlit-template/docker_app/app_streaming.py +++ b/04-UX-demos/01-streamlit-template/docker_app/app_streaming.py @@ -85,10 +85,6 @@ def logout(): # Add user message to chat history st.session_state.messages.append({"role": "user", "content": prompt}) - # Clear previous tool usage details - if "details_placeholder" in st.session_state: - st.session_state.details_placeholder.empty() - # Display user message with st.chat_message("user"): st.write(prompt) From 1e6bbc9ad3f4d74b02be10ad078db3cdba84111d Mon Sep 17 00:00:00 2001 From: Madhu Nunna Date: Mon, 30 Mar 2026 11:07:52 -0700 Subject: [PATCH 2/7] fix(streamlit): Initialize details_placeholder at startup Initialize details_placeholder in session state at app startup to prevent uninitialized reference errors while preserving the cleanup behavior that prevents stale tool details from persisting across prompts. This addresses feedback on PR-232 by using the safer approach of initializing the placeholder rather than removing the cleanup call. --- 04-UX-demos/01-streamlit-template/docker_app/app_streaming.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/04-UX-demos/01-streamlit-template/docker_app/app_streaming.py b/04-UX-demos/01-streamlit-template/docker_app/app_streaming.py index b6cd1fc3..39e5f311 100644 --- a/04-UX-demos/01-streamlit-template/docker_app/app_streaming.py +++ b/04-UX-demos/01-streamlit-template/docker_app/app_streaming.py @@ -15,6 +15,10 @@ if "messages" not in st.session_state: st.session_state.messages = [] +# Initialize details placeholder +if "details_placeholder" not in st.session_state: + st.session_state.details_placeholder = st.empty() + # ID of Secrets Manager containing cognito parameters secrets_manager_id = Config.SECRETS_MANAGER_ID From b35917128da5bb74ada39a68dffe9bbb04f273c7 Mon Sep 17 00:00:00 2001 From: Madhu Nunna Date: Mon, 30 Mar 2026 11:07:52 -0700 Subject: [PATCH 3/7] fix(streamlit): Initialize details_placeholder at startup Initialize details_placeholder in session state at app startup to prevent uninitialized reference errors while preserving the cleanup behavior that prevents stale tool details from persisting across prompts. This addresses feedback on PR-232 by using the safer approach of initializing the placeholder rather than removing the cleanup call. --- 04-UX-demos/01-streamlit-template/docker_app/app_streaming.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/04-UX-demos/01-streamlit-template/docker_app/app_streaming.py b/04-UX-demos/01-streamlit-template/docker_app/app_streaming.py index 75265000..74cb50a4 100644 --- a/04-UX-demos/01-streamlit-template/docker_app/app_streaming.py +++ b/04-UX-demos/01-streamlit-template/docker_app/app_streaming.py @@ -15,6 +15,10 @@ if "messages" not in st.session_state: st.session_state.messages = [] +# Initialize details placeholder +if "details_placeholder" not in st.session_state: + st.session_state.details_placeholder = st.empty() + # ID of Secrets Manager containing cognito parameters secrets_manager_id = Config.SECRETS_MANAGER_ID From 6f1eeb4ae9c29b312916e2293d47b88da0f3932d Mon Sep 17 00:00:00 2001 From: Madhu Nunna Date: Mon, 30 Mar 2026 11:17:23 -0700 Subject: [PATCH 4/7] fix(streamlit): Restore cleanup call for details_placeholder Add back the cleanup call that clears details_placeholder between prompts. The previous commit only added initialization but missed restoring the cleanup behavior that prevents stale tool details from persisting. Now the fix properly: - Initializes details_placeholder at startup (prevents crash) - Clears it before each new prompt (prevents stale content) --- 04-UX-demos/01-streamlit-template/docker_app/app_streaming.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/04-UX-demos/01-streamlit-template/docker_app/app_streaming.py b/04-UX-demos/01-streamlit-template/docker_app/app_streaming.py index 74cb50a4..39e5f311 100644 --- a/04-UX-demos/01-streamlit-template/docker_app/app_streaming.py +++ b/04-UX-demos/01-streamlit-template/docker_app/app_streaming.py @@ -89,6 +89,10 @@ def logout(): # Add user message to chat history st.session_state.messages.append({"role": "user", "content": prompt}) + # Clear previous tool usage details + if "details_placeholder" in st.session_state: + st.session_state.details_placeholder.empty() + # Display user message with st.chat_message("user"): st.write(prompt) From c1414dc2a879ae77d6167fedc0c5fc48fa0ffc03 Mon Sep 17 00:00:00 2001 From: Madhu Nunna Date: Mon, 30 Mar 2026 12:57:33 -0700 Subject: [PATCH 5/7] fix(evals): replace deprecated Dataset with Experiment class strands_evals removed Dataset in favor of Experiment. Update 3 evaluation notebooks to use the new API: - Replace 'from strands_evals import Dataset' with Experiment - Replace Dataset() constructor with Experiment() - Update evaluator= (singular) to evaluators=[] (plural list) - Replace Dataset.from_file() with Experiment.from_file() Fixes #213 --- .../03-dataset-generation.ipynb | 14 +++--- .../06-multi-agent-evaluation.ipynb | 44 +++++++++---------- .../05-multi-turn-actor-simulator.ipynb | 4 +- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/python/06-evaluate/dataset-generation/03-dataset-generation.ipynb b/python/06-evaluate/dataset-generation/03-dataset-generation.ipynb index 7a83da46..99e18422 100644 --- a/python/06-evaluate/dataset-generation/03-dataset-generation.ipynb +++ b/python/06-evaluate/dataset-generation/03-dataset-generation.ipynb @@ -62,7 +62,7 @@ "| Operation | Method | Use Case |\n", "|:----------|:-------|:---------|\n", "| Save | `dataset.to_file('name.json')` | Preserve for reuse, version control |\n", - "| Load | `Dataset.from_file('name.json')` | Consistent evaluation, team sharing |" + "| Load | `Experiment.from_file('name.json')` | Consistent evaluation, team sharing |" ] }, { @@ -112,7 +112,7 @@ "from strands.multiagent import GraphBuilder\n", "\n", "# Strands Evals imports\n", - "from strands_evals import Dataset, Case\n", + "from strands_evals import Experiment, Case\n", "from strands_evals.generators import DatasetGenerator\n", "from strands_evals.evaluators import OutputEvaluator\n", "\n", @@ -375,7 +375,7 @@ "outputs": [], "source": [ "# Load existing dataset from JSON\n", - "loaded_dataset = Dataset.from_file('scratch_dataset.json')" + "loaded_dataset = Experiment.from_file('scratch_dataset.json')" ] }, { @@ -483,9 +483,9 @@ "outputs": [], "source": [ "# Use first 3 cases for demonstration\n", - "eval_dataset = Dataset(\n", + "eval_dataset = Experiment(\n", " cases=context_dataset.cases[:3],\n", - " evaluator=context_dataset.evaluator\n", + " evaluators=[context_dataset.evaluator]\n", ")\n", "\n", "report = eval_dataset.run_evaluations(agent_task)" @@ -553,7 +553,7 @@ "\n", "```python\n", "# Load any saved dataset\n", - "dataset = Dataset.from_file('dataset_name.json')\n", + "dataset = Experiment.from_file('dataset_name.json')\n", "\n", "# Run evaluation\n", "report = dataset.run_evaluations(agent_task)\n", @@ -607,7 +607,7 @@ "- How to generate contextual test cases from agent capabilities with `from_context_async()`\n", "- How to update existing datasets with edge cases using `update_current_dataset_async()`\n", "- How to save datasets to JSON files with `dataset.to_file()`\n", - "- How to load datasets from JSON files with `Dataset.from_file()`\n", + "- How to load datasets from JSON files with `Experiment.from_file()`\n", "- How to use auto-rubric generation for evaluators\n", "- How to apply topic planning for diverse test coverage\n", "- Best practices for choosing generation strategies\n", diff --git a/python/06-evaluate/multi-agent-evaluation/06-multi-agent-evaluation.ipynb b/python/06-evaluate/multi-agent-evaluation/06-multi-agent-evaluation.ipynb index 70381476..0b5f0d05 100644 --- a/python/06-evaluate/multi-agent-evaluation/06-multi-agent-evaluation.ipynb +++ b/python/06-evaluate/multi-agent-evaluation/06-multi-agent-evaluation.ipynb @@ -120,7 +120,7 @@ "from strands import Agent, tool\n", "\n", "# Strands Evals imports\n", - "from strands_evals import Dataset, Case\n", + "from strands_evals import Experiment, Case\n", "from strands_evals.evaluators import OutputEvaluator, ToolSelectionAccuracyEvaluator, InteractionsEvaluator\n", "from strands_evals.extractors import tools_use_extractor\n", "from strands_evals.types import Interaction\n", @@ -582,44 +582,44 @@ " return {\"output\": str(response), \"trajectory\": session}\n", "\n", "# Create 8 datasets (4 agents x 2 evaluators each)\n", - "tech_output_dataset = Dataset(\n", + "tech_output_dataset = Experiment(\n", " cases=[individual_test_cases[0]],\n", - " evaluator=output_evaluator\n", + " evaluators=[output_evaluator]\n", ")\n", "\n", - "tech_tool_dataset = Dataset(\n", + "tech_tool_dataset = Experiment(\n", " cases=[individual_test_cases[0]],\n", - " evaluator=tool_selection_evaluator\n", + " evaluators=[tool_selection_evaluator]\n", ")\n", "\n", - "billing_output_dataset = Dataset(\n", + "billing_output_dataset = Experiment(\n", " cases=[individual_test_cases[1]],\n", - " evaluator=output_evaluator\n", + " evaluators=[output_evaluator]\n", ")\n", "\n", - "billing_tool_dataset = Dataset(\n", + "billing_tool_dataset = Experiment(\n", " cases=[individual_test_cases[1]],\n", - " evaluator=tool_selection_evaluator\n", + " evaluators=[tool_selection_evaluator]\n", ")\n", "\n", - "product_output_dataset = Dataset(\n", + "product_output_dataset = Experiment(\n", " cases=[individual_test_cases[2]],\n", - " evaluator=output_evaluator\n", + " evaluators=[output_evaluator]\n", ")\n", "\n", - "product_tool_dataset = Dataset(\n", + "product_tool_dataset = Experiment(\n", " cases=[individual_test_cases[2]],\n", - " evaluator=tool_selection_evaluator\n", + " evaluators=[tool_selection_evaluator]\n", ")\n", "\n", - "returns_output_dataset = Dataset(\n", + "returns_output_dataset = Experiment(\n", " cases=[individual_test_cases[3]],\n", - " evaluator=output_evaluator\n", + " evaluators=[output_evaluator]\n", ")\n", "\n", - "returns_tool_dataset = Dataset(\n", + "returns_tool_dataset = Experiment(\n", " cases=[individual_test_cases[3]],\n", - " evaluator=tool_selection_evaluator\n", + " evaluators=[tool_selection_evaluator]\n", ")\n", "\n", "print(\"Evaluating individual agent performance...\\n\")\n", @@ -779,9 +779,9 @@ " return str(response)\n", "\n", "# Create dataset for system evaluation\n", - "system_dataset = Dataset(\n", + "system_dataset = Experiment(\n", " cases=system_test_cases,\n", - " evaluator=system_output_evaluator\n", + " evaluators=[system_output_evaluator]\n", ")\n", "\n", "print(\"Evaluating complete system performance...\\n\")\n", @@ -954,9 +954,9 @@ "outputs": [], "source": [ "# Create dataset for coordination evaluation\n", - "coordination_dataset = Dataset(\n", + "coordination_dataset = Experiment(\n", " cases=system_test_cases,\n", - " evaluator=interaction_evaluator\n", + " evaluators=[interaction_evaluator]\n", ")\n", "\n", "print(\"Evaluating agent coordination quality...\\n\")\n", @@ -1200,4 +1200,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/python/06-evaluate/multi-turn-actor-simulator/05-multi-turn-actor-simulator.ipynb b/python/06-evaluate/multi-turn-actor-simulator/05-multi-turn-actor-simulator.ipynb index 0cadb327..f1c33a71 100644 --- a/python/06-evaluate/multi-turn-actor-simulator/05-multi-turn-actor-simulator.ipynb +++ b/python/06-evaluate/multi-turn-actor-simulator/05-multi-turn-actor-simulator.ipynb @@ -92,7 +92,7 @@ "from strands.models import BedrockModel\n", "\n", "# Strands Evals imports\n", - "from strands_evals import Dataset, Case\n", + "from strands_evals import Experiment, Case\n", "from strands_evals.simulation import ActorSimulator\n", "\n", "# Display utilities\n", @@ -677,7 +677,7 @@ " )\n", "]\n", "\n", - "dataset = Dataset(\n", + "dataset = Experiment(\n", " cases=evaluation_cases\n", ")" ] From b65a60baecf5602c33afd204877d3d18634ab667 Mon Sep 17 00:00:00 2001 From: Madhu Nunna Date: Mon, 30 Mar 2026 12:59:31 -0700 Subject: [PATCH 6/7] fix(evals): update DatasetGenerator to ExperimentGenerator Also fix .evaluator property access to .evaluators[0] and update_current_dataset_async to update_current_experiment_async --- .../03-dataset-generation.ipynb | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/python/06-evaluate/dataset-generation/03-dataset-generation.ipynb b/python/06-evaluate/dataset-generation/03-dataset-generation.ipynb index 99e18422..94225cd9 100644 --- a/python/06-evaluate/dataset-generation/03-dataset-generation.ipynb +++ b/python/06-evaluate/dataset-generation/03-dataset-generation.ipynb @@ -55,7 +55,7 @@ "|:---------|:-------|:---------|\n", "| From Scratch | `from_scratch_async()` | New agents, broad coverage, exploratory testing |\n", "| From Context | `from_context_async()` | Testing specific tools, API integration scenarios |\n", - "| Update Existing | `update_current_dataset_async()` | Adding edge cases, iterative improvement |\n", + "| Update Existing | `update_current_experiment_async()` | Adding edge cases, iterative improvement |\n", "\n", "#### Dataset Persistence\n", "\n", @@ -113,7 +113,7 @@ "\n", "# Strands Evals imports\n", "from strands_evals import Experiment, Case\n", - "from strands_evals.generators import DatasetGenerator\n", + "from strands_evals.generators import ExperimentGenerator\n", "from strands_evals.evaluators import OutputEvaluator\n", "\n", "# Display utilities\n", @@ -179,7 +179,7 @@ "outputs": [], "source": [ "# Initialize dataset generator\n", - "generator = DatasetGenerator(\n", + "generator = ExperimentGenerator(\n", " input_type=str,\n", " output_type=str,\n", " include_expected_output=True,\n", @@ -350,7 +350,7 @@ "source": [ "### Strategy 3: Update Existing Dataset with Edge Cases\n", "\n", - "The `update_current_dataset_async()` method extends an existing dataset by adding new test cases. This is ideal for iteratively improving test coverage by adding edge cases, corner scenarios, or addressing gaps discovered in production.\n", + "The `update_current_experiment_async()` method extends an existing dataset by adding new test cases. This is ideal for iteratively improving test coverage by adding edge cases, corner scenarios, or addressing gaps discovered in production.\n", "\n", "#### Key Features\n", "- **Incremental improvement**: Add tests without starting from scratch\n", @@ -396,7 +396,7 @@ "\"\"\"\n", "\n", "# Update dataset by adding edge cases\n", - "updated_dataset = await generator.update_current_dataset_async(\n", + "updated_dataset = await generator.update_current_experiment_async(\n", " source_dataset=loaded_dataset,\n", " task_description=\"Multi-agent decision system handling complex and edge case scenarios\",\n", " num_cases=6,\n", @@ -407,7 +407,7 @@ "print(f\"\\nOriginal dataset: {len(loaded_dataset.cases)} cases\")\n", "print(f\"Updated dataset: {len(updated_dataset.cases)} cases\")\n", "print(f\"New cases added: {len(updated_dataset.cases) - len(loaded_dataset.cases)}\")\n", - "print(f\"\\nUpdated rubric: {updated_dataset.evaluator.rubric}\")" + "print(f\"\\nUpdated rubric: {updated_dataset.evaluators[0].rubric}\")" ] }, { @@ -485,7 +485,7 @@ "# Use first 3 cases for demonstration\n", "eval_dataset = Experiment(\n", " cases=context_dataset.cases[:3],\n", - " evaluators=[context_dataset.evaluator]\n", + " evaluators=[context_dataset.evaluators[0]]\n", ")\n", "\n", "report = eval_dataset.run_evaluations(agent_task)" @@ -544,7 +544,7 @@ "- Use case: Context-aware testing\n", "\n", "**3. updated_dataset.json**\n", - "- Strategy: update_current_dataset_async()\n", + "- Strategy: update_current_experiment_async()\n", "- Source: scratch_dataset.json + edge cases\n", "- Test cases: 15 (original 9 + 6 new)\n", "- Use case: Iterative improvement with edge cases\n", @@ -583,7 +583,7 @@ "|:---------|:---------|\n", "| `from_scratch_async()` | Starting new project, need broad coverage, no detailed context yet |\n", "| `from_context_async()` | Have well-defined tools/APIs, need tests matching actual capabilities |\n", - "| `update_current_dataset_async()` | Improving existing dataset, discovered gaps, adding edge cases |\n", + "| `update_current_experiment_async()` | Improving existing dataset, discovered gaps, adding edge cases |\n", "\n", "### Key Recommendations\n", "\n", @@ -605,7 +605,7 @@ "\n", "- How to generate test cases from scratch using topics with `from_scratch_async()`\n", "- How to generate contextual test cases from agent capabilities with `from_context_async()`\n", - "- How to update existing datasets with edge cases using `update_current_dataset_async()`\n", + "- How to update existing datasets with edge cases using `update_current_experiment_async()`\n", "- How to save datasets to JSON files with `dataset.to_file()`\n", "- How to load datasets from JSON files with `Experiment.from_file()`\n", "- How to use auto-rubric generation for evaluators\n", From ef37c440c60d24bfde9a55487d680daa2d99b7b2 Mon Sep 17 00:00:00 2001 From: Madhu Nunna Date: Mon, 30 Mar 2026 17:28:06 -0700 Subject: [PATCH 7/7] chore: add Madhu Nunna to CONTRIBUTORS.md --- CONTRIBUTORS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 816ff856..0e2a6683 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -14,4 +14,5 @@ - [Evandro Franco](https://github.com/evandrofranco) - [Sanghwa Na](https://github.com/didhd) - [Neelam Koshiya](https://github.com/neelamkoshiya) -- [Asif Mithawala](https://github.com/asifma) \ No newline at end of file +- [Asif Mithawala](https://github.com/asifma) +- [Madhu Nunna](https://github.com/madhununna) \ No newline at end of file