
Commit 14fbf85

Merge branch 'main' into fix-issue-1641-database-session-service
2 parents b2a9a35 + 77b869f commit 14fbf85

File tree

12 files changed: +1241 −131 lines


src/google/adk/a2a/converters/event_converter.py

Lines changed: 255 additions & 33 deletions
Large diffs are not rendered by default.

src/google/adk/cli/cli_eval.py

Lines changed: 4 additions & 6 deletions
@@ -26,6 +26,7 @@
 from ..agents import Agent
 from ..artifacts.base_artifact_service import BaseArtifactService
+from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
 from ..evaluation.eval_case import EvalCase
 from ..evaluation.eval_metrics import EvalMetric
 from ..evaluation.eval_metrics import EvalMetricResult
@@ -38,10 +39,6 @@
 logger = logging.getLogger("google_adk." + __name__)
 
 
-MISSING_EVAL_DEPENDENCIES_MESSAGE = (
-    "Eval module is not installed, please install via `pip install"
-    " google-adk[eval]`."
-)
 TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"
 RESPONSE_MATCH_SCORE_KEY = "response_match_score"
 # This evaluation is not very stable.
@@ -150,7 +147,7 @@ async def run_evals(
     artifact_service: The artifact service to use during inferencing.
   """
   try:
-    from ..evaluation.agent_evaluator import EvaluationGenerator
+    from ..evaluation.evaluation_generator import EvaluationGenerator
   except ModuleNotFoundError as e:
     raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
 
@@ -252,7 +249,8 @@ async def run_evals(
          result = "❌ Failed"
 
        print(f"Result: {result}\n")
-
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
    except Exception:
      # Catching the general exception, so that we don't block other eval
      # cases.
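
Note: the hunks above move MISSING_EVAL_DEPENDENCIES_MESSAGE into a shared constants module and defer the EvaluationGenerator import until evals are actually run. A minimal sketch of that deferred-import guard as a standalone helper (the helper name load_evaluation_generator is illustrative, not part of this commit):

# Sketch only: mirrors the guard used in cli_eval.py, written against the
# absolute package paths introduced by this commit.
from google.adk.evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE


def load_evaluation_generator():
  """Imports EvaluationGenerator only if the eval extras are installed."""
  try:
    # Requires `pip install google-adk[eval]`.
    from google.adk.evaluation.evaluation_generator import EvaluationGenerator
  except ModuleNotFoundError as e:
    raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
  return EvaluationGenerator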

src/google/adk/cli/cli_tools_click.py

Lines changed: 1 addition & 1 deletion
@@ -31,12 +31,12 @@
 from . import cli_create
 from . import cli_deploy
 from .. import version
+from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
 from ..evaluation.gcs_eval_set_results_manager import GcsEvalSetResultsManager
 from ..evaluation.gcs_eval_sets_manager import GcsEvalSetsManager
 from ..evaluation.local_eval_set_results_manager import LocalEvalSetResultsManager
 from ..sessions.in_memory_session_service import InMemorySessionService
 from .cli import run_cli
-from .cli_eval import MISSING_EVAL_DEPENDENCIES_MESSAGE
 from .fast_api import get_fast_api_app
 from .utils import envs
 from .utils import evals

src/google/adk/evaluation/agent_evaluator.py

Lines changed: 99 additions & 10 deletions
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import json
 import logging
 import os
@@ -23,16 +25,16 @@
 from typing import Union
 import uuid
 
+from google.genai import types as genai_types
 from pydantic import ValidationError
 
+from .constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
+from .eval_case import IntermediateData
 from .eval_set import EvalSet
-from .evaluation_generator import EvaluationGenerator
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .local_eval_sets_manager import convert_eval_set_to_pydanctic_schema
-from .response_evaluator import ResponseEvaluator
-from .trajectory_evaluator import TrajectoryEvaluator
 
 logger = logging.getLogger("google_adk." + __name__)
 
@@ -96,6 +98,7 @@ async def evaluate_eval_set(
       criteria: dict[str, float],
       num_runs=NUM_RUNS,
       agent_name=None,
+      print_detailed_results: bool = True,
   ):
     """Evaluates an agent using the given EvalSet.
 
@@ -109,14 +112,22 @@ async def evaluate_eval_set(
       num_runs: Number of times all entries in the eval dataset should be
        assessed.
      agent_name: The name of the agent.
+      print_detailed_results: Whether to print detailed results for each metric
+        evaluation.
    """
+    try:
+      from .evaluation_generator import EvaluationGenerator
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
    eval_case_responses_list = await EvaluationGenerator.generate_responses(
        eval_set=eval_set,
        agent_module_path=agent_module,
        repeat_num=num_runs,
        agent_name=agent_name,
    )
 
+    failures = []
+
    for eval_case_responses in eval_case_responses_list:
      actual_invocations = [
          invocation
@@ -139,10 +150,25 @@ async def evaluate_eval_set(
          )
      )
 
-      assert evaluation_result.overall_eval_status == EvalStatus.PASSED, (
-          f"{metric_name} for {agent_module} Failed. Expected {threshold},"
-          f" but got {evaluation_result.overall_score}."
-      )
+      if print_detailed_results:
+        AgentEvaluator._print_details(
+            evaluation_result=evaluation_result,
+            metric_name=metric_name,
+            threshold=threshold,
+        )
+
+      # Gather all the failures.
+      if evaluation_result.overall_eval_status != EvalStatus.PASSED:
+        failures.append(
+            f"{metric_name} for {agent_module} Failed. Expected {threshold},"
+            f" but got {evaluation_result.overall_score}."
+        )
+
+    assert not failures, (
+        "Following are all the test failures. If you looking to get more"
+        " details on the failures, then please re-run this test with"
+        " `print_details` set to `True`.\n{}".format("\n".join(failures))
+    )
 
   @staticmethod
   async def evaluate(
@@ -158,9 +184,10 @@ async def evaluate(
       agent_module: The path to python module that contains the definition of
        the agent. There is convention in place here, where the code is going to
        look for 'root_agent' in the loaded module.
-      eval_dataset_file_path_or_dir: The eval data set. This can be either a string representing
-        full path to the file containing eval dataset, or a directory that is
-        recursively explored for all files that have a `.test.json` suffix.
+      eval_dataset_file_path_or_dir: The eval data set. This can be either a
+        string representing full path to the file containing eval dataset, or a
+        directory that is recursively explored for all files that have a
+        `.test.json` suffix.
      num_runs: Number of times all entries in the eval dataset should be
        assessed.
      agent_name: The name of the agent.
@@ -358,6 +385,11 @@ def _validate_input(eval_dataset, criteria):
 
   @staticmethod
   def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
+    try:
+      from .response_evaluator import ResponseEvaluator
+      from .trajectory_evaluator import TrajectoryEvaluator
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
    if metric_name == TOOL_TRAJECTORY_SCORE_KEY:
      return TrajectoryEvaluator(threshold=threshold)
    elif (
@@ -367,3 +399,60 @@ def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
      return ResponseEvaluator(threshold=threshold, metric_name=metric_name)
 
    raise ValueError(f"Unsupported eval metric: {metric_name}")
+
+  @staticmethod
+  def _print_details(
+      evaluation_result: EvaluationResult, metric_name: str, threshold: float
+  ):
+    try:
+      from pandas import pandas as pd
+      from tabulate import tabulate
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
+    print(
+        f"Summary: `{evaluation_result.overall_eval_status}` for Metric:"
+        f" `{metric_name}`. Expected threshold: `{threshold}`, actual value:"
+        f" `{evaluation_result.overall_score}`."
+    )
+
+    data = []
+    for per_invocation_result in evaluation_result.per_invocation_results:
+      data.append({
+          "eval_status": per_invocation_result.eval_status,
+          "score": per_invocation_result.score,
+          "threshold": threshold,
+          "prompt": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.expected_invocation.user_content
+          ),
+          "expected_response": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.expected_invocation.final_response
+          ),
+          "actual_response": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.actual_invocation.final_response
+          ),
+          "expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
+              per_invocation_result.expected_invocation.intermediate_data
+          ),
+          "actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
+              per_invocation_result.actual_invocation.intermediate_data
+          ),
+      })
+
+    print(tabulate(pd.DataFrame(data), headers="keys", tablefmt="grid"))
+    print("\n\n")  # Few empty lines for visual clarity
+
+  @staticmethod
+  def _convert_content_to_text(content: Optional[genai_types.Content]) -> str:
+    if content and content.parts:
+      return "\n".join([p.text for p in content.parts if p.text])
+
+    return ""
+
+  @staticmethod
+  def _convert_tool_calls_to_text(
+      intermediate_data: Optional[IntermediateData],
+  ) -> str:
+    if intermediate_data and intermediate_data.tool_uses:
+      return "\n".join([str(t) for t in intermediate_data.tool_uses])
+
+    return ""

src/google/adk/evaluation/constants.py

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+MISSING_EVAL_DEPENDENCIES_MESSAGE = (
+    "Eval module is not installed, please install via `pip install"
+    " google-adk[eval]`."
+)

src/google/adk/evaluation/evaluation_generator.py

Lines changed: 1 addition & 1 deletion
@@ -182,7 +182,7 @@ async def _generate_inferences_from_root_agent(
     tool_uses = []
     invocation_id = ""
 
-    for event in runner.run(
+    async for event in runner.run_async(
        user_id=user_id, session_id=session_id, new_message=user_content
    ):
      invocation_id = (
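
Note: inference generation now iterates runner.run_async instead of the synchronous runner.run. A minimal sketch of consuming that async event stream (the runner, user_id, and session_id values and the collect_final_response helper are assumptions for illustration):

# Sketch only: drains the async event stream and keeps the last text part.
from google.genai import types as genai_types


async def collect_final_response(runner, user_id: str, session_id: str) -> str:
  user_content = genai_types.Content(
      role="user", parts=[genai_types.Part(text="Plan a weekend trip.")]
  )
  final_text = ""
  async for event in runner.run_async(
      user_id=user_id, session_id=session_id, new_message=user_content
  ):
    if event.content and event.content.parts and event.content.parts[0].text:
      final_text = event.content.parts[0].text
  return final_text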

src/google/adk/models/lite_llm.py

Lines changed: 50 additions & 10 deletions
@@ -23,6 +23,7 @@
 from typing import Dict
 from typing import Generator
 from typing import Iterable
+from typing import List
 from typing import Literal
 from typing import Optional
 from typing import Tuple
@@ -485,16 +486,22 @@ def _message_to_generate_content_response(
 
 def _get_completion_inputs(
     llm_request: LlmRequest,
-) -> tuple[Iterable[Message], Iterable[dict]]:
-  """Converts an LlmRequest to litellm inputs.
+) -> Tuple[
+    List[Message],
+    Optional[List[Dict]],
+    Optional[types.SchemaUnion],
+    Optional[Dict],
+]:
+  """Converts an LlmRequest to litellm inputs and extracts generation params.
 
   Args:
     llm_request: The LlmRequest to convert.
 
   Returns:
-    The litellm inputs (message list, tool dictionary and response format).
+    The litellm inputs (message list, tool dictionary, response format and generation params).
   """
-  messages = []
+  # 1. Construct messages
+  messages: List[Message] = []
   for content in llm_request.contents or []:
     message_param_or_list = _content_to_message_param(content)
    if isinstance(message_param_or_list, list):
@@ -511,7 +518,8 @@ def _get_completion_inputs(
        ),
    )
 
-  tools = None
+  # 2. Convert tool declarations
+  tools: Optional[List[Dict]] = None
  if (
      llm_request.config
      and llm_request.config.tools
@@ -522,12 +530,39 @@ def _get_completion_inputs(
        for tool in llm_request.config.tools[0].function_declarations
    ]
 
-  response_format = None
-
-  if llm_request.config.response_schema:
+  # 3. Handle response format
+  response_format: Optional[types.SchemaUnion] = None
+  if llm_request.config and llm_request.config.response_schema:
    response_format = llm_request.config.response_schema
 
-  return messages, tools, response_format
+  # 4. Extract generation parameters
+  generation_params: Optional[Dict] = None
+  if llm_request.config:
+    config_dict = llm_request.config.model_dump(exclude_none=True)
+    # Generate LiteLlm parameters here,
+    # Following https://docs.litellm.ai/docs/completion/input.
+    generation_params = {}
+    param_mapping = {
+        "max_output_tokens": "max_completion_tokens",
+        "stop_sequences": "stop",
+    }
+    for key in (
+        "temperature",
+        "max_output_tokens",
+        "top_p",
+        "top_k",
+        "stop_sequences",
+        "presence_penalty",
+        "frequency_penalty",
+    ):
+      if key in config_dict:
+        mapped_key = param_mapping.get(key, key)
+        generation_params[mapped_key] = config_dict[key]
+
+    if not generation_params:
+      generation_params = None
+
+  return messages, tools, response_format, generation_params
 
 
 def _build_function_declaration_log(
@@ -664,7 +699,9 @@ async def generate_content_async(
    self._maybe_append_user_content(llm_request)
    logger.debug(_build_request_log(llm_request))
 
-    messages, tools, response_format = _get_completion_inputs(llm_request)
+    messages, tools, response_format, generation_params = (
+        _get_completion_inputs(llm_request)
+    )
 
    if "functions" in self._additional_args:
      # LiteLLM does not support both tools and functions together.
@@ -678,6 +715,9 @@ async def generate_content_async(
    }
    completion_args.update(self._additional_args)
 
+    if generation_params:
+      completion_args.update(generation_params)
+
    if stream:
      text = ""
      # Track function calls by index
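
Note: the new fourth return value forwards generation settings from llm_request.config to LiteLLM, renaming the two fields whose names differ (max_output_tokens becomes max_completion_tokens, stop_sequences becomes stop). A standalone sketch of just that mapping step, applied to an already-dumped config dict (extract_generation_params is an illustrative helper, not part of this commit):

# Sketch only: reproduces the param_mapping loop on a plain dict.
from typing import Dict, Optional

_PARAM_MAPPING = {
    "max_output_tokens": "max_completion_tokens",
    "stop_sequences": "stop",
}
_FORWARDED_KEYS = (
    "temperature",
    "max_output_tokens",
    "top_p",
    "top_k",
    "stop_sequences",
    "presence_penalty",
    "frequency_penalty",
)


def extract_generation_params(config_dict: Dict) -> Optional[Dict]:
  generation_params = {
      _PARAM_MAPPING.get(key, key): config_dict[key]
      for key in _FORWARDED_KEYS
      if key in config_dict
  }
  return generation_params or None


# Expected: {'temperature': 0.2, 'max_completion_tokens': 256, 'stop': ['###']}
print(extract_generation_params(
    {"temperature": 0.2, "max_output_tokens": 256, "stop_sequences": ["###"]}
))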

src/google/adk/tools/google_search_tool.py

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ async def process_llm_request(
       llm_request.config.tools.append(
           types.Tool(google_search_retrieval=types.GoogleSearchRetrieval())
       )
-    elif llm_request.model and 'gemini-2' in llm_request.model:
+    elif llm_request.model and 'gemini-' in llm_request.model:
      llm_request.config.tools.append(
          types.Tool(google_search=types.GoogleSearch())
      )
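
Note: the elif now matches any model name containing 'gemini-' rather than only those containing 'gemini-2'. A tiny illustration of the broadened substring check against example model names (the list of names is illustrative):

# Sketch only: evaluates the broadened elif condition for a few model ids.
for model in ("gemini-2.0-flash", "gemini-2.5-pro", "gemini-1.5-pro", "gpt-4o"):
  print(model, "matches 'gemini-' check:", bool(model) and "gemini-" in model)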

tests/integration/fixture/trip_planner_agent/initial.session.json

Lines changed: 0 additions & 13 deletions
This file was deleted.
