
Commit 14fbf85

Merge branch 'main' into fix-issue-1641-database-session-service
2 parents b2a9a35 + 77b869f commit 14fbf85

File tree

12 files changed: +1241 −131 lines


src/google/adk/a2a/converters/event_converter.py

Lines changed: 255 additions & 33 deletions
Large diffs are not rendered by default.

src/google/adk/cli/cli_eval.py

Lines changed: 4 additions & 6 deletions
@@ -26,6 +26,7 @@
 from ..agents import Agent
 from ..artifacts.base_artifact_service import BaseArtifactService
+from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
 from ..evaluation.eval_case import EvalCase
 from ..evaluation.eval_metrics import EvalMetric
 from ..evaluation.eval_metrics import EvalMetricResult
@@ -38,10 +39,6 @@
 logger = logging.getLogger("google_adk." + __name__)
 
 
-MISSING_EVAL_DEPENDENCIES_MESSAGE = (
-    "Eval module is not installed, please install via `pip install"
-    " google-adk[eval]`."
-)
 TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"
 RESPONSE_MATCH_SCORE_KEY = "response_match_score"
 # This evaluation is not very stable.
@@ -150,7 +147,7 @@ async def run_evals(
     artifact_service: The artifact service to use during inferencing.
   """
   try:
-    from ..evaluation.agent_evaluator import EvaluationGenerator
+    from ..evaluation.evaluation_generator import EvaluationGenerator
   except ModuleNotFoundError as e:
     raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
 
@@ -252,7 +249,8 @@ async def run_evals(
          result = "❌ Failed"
 
        print(f"Result: {result}\n")
-
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
    except Exception:
      # Catching the general exception, so that we don't block other eval
      # cases.
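
Note: the hunks above move MISSING_EVAL_DEPENDENCIES_MESSAGE into a shared constants module and defer the EvaluationGenerator import until evals are actually run. A minimal sketch of that deferred-import guard as a standalone helper (the helper name load_evaluation_generator is illustrative, not part of this commit):

# Sketch only: mirrors the guard used in cli_eval.py, written against the
# absolute package paths introduced by this commit.
from google.adk.evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE


def load_evaluation_generator():
  """Imports EvaluationGenerator only if the eval extras are installed."""
  try:
    # Requires `pip install google-adk[eval]`.
    from google.adk.evaluation.evaluation_generator import EvaluationGenerator
  except ModuleNotFoundError as e:
    raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
  return EvaluationGenerator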

src/google/adk/cli/cli_tools_click.py

Lines changed: 1 addition & 1 deletion
@@ -31,12 +31,12 @@
 from . import cli_create
 from . import cli_deploy
 from .. import version
+from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
 from ..evaluation.gcs_eval_set_results_manager import GcsEvalSetResultsManager
 from ..evaluation.gcs_eval_sets_manager import GcsEvalSetsManager
 from ..evaluation.local_eval_set_results_manager import LocalEvalSetResultsManager
 from ..sessions.in_memory_session_service import InMemorySessionService
 from .cli import run_cli
-from .cli_eval import MISSING_EVAL_DEPENDENCIES_MESSAGE
 from .fast_api import get_fast_api_app
 from .utils import envs
 from .utils import evals

src/google/adk/evaluation/agent_evaluator.py

Lines changed: 99 additions & 10 deletions
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import json
 import logging
 import os
@@ -23,16 +25,16 @@
 from typing import Union
 import uuid
 
+from google.genai import types as genai_types
 from pydantic import ValidationError
 
+from .constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
+from .eval_case import IntermediateData
 from .eval_set import EvalSet
-from .evaluation_generator import EvaluationGenerator
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .local_eval_sets_manager import convert_eval_set_to_pydanctic_schema
-from .response_evaluator import ResponseEvaluator
-from .trajectory_evaluator import TrajectoryEvaluator
 
 logger = logging.getLogger("google_adk." + __name__)
 
@@ -96,6 +98,7 @@ async def evaluate_eval_set(
       criteria: dict[str, float],
       num_runs=NUM_RUNS,
       agent_name=None,
+      print_detailed_results: bool = True,
   ):
     """Evaluates an agent using the given EvalSet.
 
@@ -109,14 +112,22 @@ async def evaluate_eval_set(
       num_runs: Number of times all entries in the eval dataset should be
        assessed.
      agent_name: The name of the agent.
+      print_detailed_results: Whether to print detailed results for each metric
+        evaluation.
    """
+    try:
+      from .evaluation_generator import EvaluationGenerator
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
    eval_case_responses_list = await EvaluationGenerator.generate_responses(
        eval_set=eval_set,
        agent_module_path=agent_module,
        repeat_num=num_runs,
        agent_name=agent_name,
    )
 
+    failures = []
+
    for eval_case_responses in eval_case_responses_list:
      actual_invocations = [
          invocation
@@ -139,10 +150,25 @@ async def evaluate_eval_set(
          )
      )
 
-      assert evaluation_result.overall_eval_status == EvalStatus.PASSED, (
-          f"{metric_name} for {agent_module} Failed. Expected {threshold},"
-          f" but got {evaluation_result.overall_score}."
-      )
+      if print_detailed_results:
+        AgentEvaluator._print_details(
+            evaluation_result=evaluation_result,
+            metric_name=metric_name,
+            threshold=threshold,
+        )
+
+      # Gather all the failures.
+      if evaluation_result.overall_eval_status != EvalStatus.PASSED:
+        failures.append(
+            f"{metric_name} for {agent_module} Failed. Expected {threshold},"
+            f" but got {evaluation_result.overall_score}."
+        )
+
+    assert not failures, (
+        "Following are all the test failures. If you looking to get more"
+        " details on the failures, then please re-run this test with"
+        " `print_details` set to `True`.\n{}".format("\n".join(failures))
+    )
 
   @staticmethod
   async def evaluate(
@@ -158,9 +184,10 @@ async def evaluate(
       agent_module: The path to python module that contains the definition of
        the agent. There is convention in place here, where the code is going to
        look for 'root_agent' in the loaded module.
-      eval_dataset_file_path_or_dir: The eval data set. This can be either a string representing
-        full path to the file containing eval dataset, or a directory that is
-        recursively explored for all files that have a `.test.json` suffix.
+      eval_dataset_file_path_or_dir: The eval data set. This can be either a
+        string representing full path to the file containing eval dataset, or a
+        directory that is recursively explored for all files that have a
+        `.test.json` suffix.
      num_runs: Number of times all entries in the eval dataset should be
        assessed.
      agent_name: The name of the agent.
@@ -358,6 +385,11 @@ def _validate_input(eval_dataset, criteria):
 
   @staticmethod
   def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
+    try:
+      from .response_evaluator import ResponseEvaluator
+      from .trajectory_evaluator import TrajectoryEvaluator
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
    if metric_name == TOOL_TRAJECTORY_SCORE_KEY:
      return TrajectoryEvaluator(threshold=threshold)
    elif (
@@ -367,3 +399,60 @@ def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
      return ResponseEvaluator(threshold=threshold, metric_name=metric_name)
 
    raise ValueError(f"Unsupported eval metric: {metric_name}")
+
+  @staticmethod
+  def _print_details(
+      evaluation_result: EvaluationResult, metric_name: str, threshold: float
+  ):
+    try:
+      from pandas import pandas as pd
+      from tabulate import tabulate
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
+    print(
+        f"Summary: `{evaluation_result.overall_eval_status}` for Metric:"
+        f" `{metric_name}`. Expected threshold: `{threshold}`, actual value:"
+        f" `{evaluation_result.overall_score}`."
+    )
+
+    data = []
+    for per_invocation_result in evaluation_result.per_invocation_results:
+      data.append({
+          "eval_status": per_invocation_result.eval_status,
+          "score": per_invocation_result.score,
+          "threshold": threshold,
+          "prompt": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.expected_invocation.user_content
+          ),
+          "expected_response": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.expected_invocation.final_response
+          ),
+          "actual_response": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.actual_invocation.final_response
+          ),
+          "expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
+              per_invocation_result.expected_invocation.intermediate_data
+          ),
+          "actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
+              per_invocation_result.actual_invocation.intermediate_data
+          ),
+      })
+
+    print(tabulate(pd.DataFrame(data), headers="keys", tablefmt="grid"))
+    print("\n\n")  # Few empty lines for visual clarity
+
+  @staticmethod
+  def _convert_content_to_text(content: Optional[genai_types.Content]) -> str:
+    if content and content.parts:
+      return "\n".join([p.text for p in content.parts if p.text])
+
+    return ""
+
+  @staticmethod
+  def _convert_tool_calls_to_text(
+      intermediate_data: Optional[IntermediateData],
+  ) -> str:
+    if intermediate_data and intermediate_data.tool_uses:
+      return "\n".join([str(t) for t in intermediate_data.tool_uses])
+
+    return ""

src/google/adk/evaluation/constants.py

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+MISSING_EVAL_DEPENDENCIES_MESSAGE = (
+    "Eval module is not installed, please install via `pip install"
+    " google-adk[eval]`."
+)

src/google/adk/evaluation/evaluation_generator.py

Lines changed: 1 addition & 1 deletion
@@ -182,7 +182,7 @@ async def _generate_inferences_from_root_agent(
     tool_uses = []
     invocation_id = ""
 
-    for event in runner.run(
+    async for event in runner.run_async(
        user_id=user_id, session_id=session_id, new_message=user_content
    ):
      invocation_id = (
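
Note: inference generation now iterates runner.run_async instead of the synchronous runner.run. A minimal sketch of consuming that async event stream (the runner, user_id, and session_id values and the collect_final_response helper are assumptions for illustration):

# Sketch only: drains the async event stream and keeps the last text part.
from google.genai import types as genai_types


async def collect_final_response(runner, user_id: str, session_id: str) -> str:
  user_content = genai_types.Content(
      role="user", parts=[genai_types.Part(text="Plan a weekend trip.")]
  )
  final_text = ""
  async for event in runner.run_async(
      user_id=user_id, session_id=session_id, new_message=user_content
  ):
    if event.content and event.content.parts and event.content.parts[0].text:
      final_text = event.content.parts[0].text
  return final_text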

src/google/adk/models/lite_llm.py

Lines changed: 50 additions & 10 deletions
@@ -23,6 +23,7 @@
 from typing import Dict
 from typing import Generator
 from typing import Iterable
+from typing import List
 from typing import Literal
 from typing import Optional
 from typing import Tuple
@@ -485,16 +486,22 @@ def _message_to_generate_content_response(
 
 def _get_completion_inputs(
     llm_request: LlmRequest,
-) -> tuple[Iterable[Message], Iterable[dict]]:
-  """Converts an LlmRequest to litellm inputs.
+) -> Tuple[
+    List[Message],
+    Optional[List[Dict]],
+    Optional[types.SchemaUnion],
+    Optional[Dict],
+]:
+  """Converts an LlmRequest to litellm inputs and extracts generation params.
 
   Args:
     llm_request: The LlmRequest to convert.
 
   Returns:
-    The litellm inputs (message list, tool dictionary and response format).
+    The litellm inputs (message list, tool dictionary, response format and generation params).
   """
-  messages = []
+  # 1. Construct messages
+  messages: List[Message] = []
   for content in llm_request.contents or []:
     message_param_or_list = _content_to_message_param(content)
    if isinstance(message_param_or_list, list):
@@ -511,7 +518,8 @@ def _get_completion_inputs(
        ),
    )
 
-  tools = None
+  # 2. Convert tool declarations
+  tools: Optional[List[Dict]] = None
  if (
      llm_request.config
      and llm_request.config.tools
@@ -522,12 +530,39 @@ def _get_completion_inputs(
        for tool in llm_request.config.tools[0].function_declarations
    ]
 
-  response_format = None
-
-  if llm_request.config.response_schema:
+  # 3. Handle response format
+  response_format: Optional[types.SchemaUnion] = None
+  if llm_request.config and llm_request.config.response_schema:
    response_format = llm_request.config.response_schema
 
-  return messages, tools, response_format
+  # 4. Extract generation parameters
+  generation_params: Optional[Dict] = None
+  if llm_request.config:
+    config_dict = llm_request.config.model_dump(exclude_none=True)
+    # Generate LiteLlm parameters here,
+    # Following https://docs.litellm.ai/docs/completion/input.
+    generation_params = {}
+    param_mapping = {
+        "max_output_tokens": "max_completion_tokens",
+        "stop_sequences": "stop",
+    }
+    for key in (
+        "temperature",
+        "max_output_tokens",
+        "top_p",
+        "top_k",
+        "stop_sequences",
+        "presence_penalty",
+        "frequency_penalty",
+    ):
+      if key in config_dict:
+        mapped_key = param_mapping.get(key, key)
+        generation_params[mapped_key] = config_dict[key]
+
+    if not generation_params:
+      generation_params = None
+
+  return messages, tools, response_format, generation_params
 
 
 def _build_function_declaration_log(
@@ -664,7 +699,9 @@ async def generate_content_async(
    self._maybe_append_user_content(llm_request)
    logger.debug(_build_request_log(llm_request))
 
-    messages, tools, response_format = _get_completion_inputs(llm_request)
+    messages, tools, response_format, generation_params = (
+        _get_completion_inputs(llm_request)
+    )
 
    if "functions" in self._additional_args:
      # LiteLLM does not support both tools and functions together.
@@ -678,6 +715,9 @@ async def generate_content_async(
    }
    completion_args.update(self._additional_args)
 
+    if generation_params:
+      completion_args.update(generation_params)
+
    if stream:
      text = ""
      # Track function calls by index
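
Note: the new fourth return value forwards generation settings from llm_request.config to LiteLLM, renaming the two fields whose names differ (max_output_tokens becomes max_completion_tokens, stop_sequences becomes stop). A standalone sketch of just that mapping step, applied to an already-dumped config dict (extract_generation_params is an illustrative helper, not part of this commit):

# Sketch only: reproduces the param_mapping loop on a plain dict.
from typing import Dict, Optional

_PARAM_MAPPING = {
    "max_output_tokens": "max_completion_tokens",
    "stop_sequences": "stop",
}
_FORWARDED_KEYS = (
    "temperature",
    "max_output_tokens",
    "top_p",
    "top_k",
    "stop_sequences",
    "presence_penalty",
    "frequency_penalty",
)


def extract_generation_params(config_dict: Dict) -> Optional[Dict]:
  generation_params = {
      _PARAM_MAPPING.get(key, key): config_dict[key]
      for key in _FORWARDED_KEYS
      if key in config_dict
  }
  return generation_params or None


# Expected: {'temperature': 0.2, 'max_completion_tokens': 256, 'stop': ['###']}
print(extract_generation_params(
    {"temperature": 0.2, "max_output_tokens": 256, "stop_sequences": ["###"]}
))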

src/google/adk/tools/google_search_tool.py

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ async def process_llm_request(
       llm_request.config.tools.append(
           types.Tool(google_search_retrieval=types.GoogleSearchRetrieval())
       )
-    elif llm_request.model and 'gemini-2' in llm_request.model:
+    elif llm_request.model and 'gemini-' in llm_request.model:
      llm_request.config.tools.append(
          types.Tool(google_search=types.GoogleSearch())
      )
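
Note: the elif now matches any model name containing 'gemini-' rather than only those containing 'gemini-2'. A tiny illustration of the broadened substring check against example model names (the list of names is illustrative):

# Sketch only: evaluates the broadened elif condition for a few model ids.
for model in ("gemini-2.0-flash", "gemini-2.5-pro", "gemini-1.5-pro", "gpt-4o"):
  print(model, "matches 'gemini-' check:", bool(model) and "gemini-" in model)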

tests/integration/fixture/trip_planner_agent/initial.session.json

Lines changed: 0 additions & 13 deletions
This file was deleted.
