From efa63b60640e06787a740b96f8b3f88199d5204d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daiane=20Galv=C3=A3o?= <104103307+daiane-galvao@users.noreply.github.com> Date: Mon, 30 Jun 2025 17:48:03 -0300 Subject: [PATCH 1/4] feat(metrics): add ToolCallF1 for evaluating tool call precision, recall and F1-score --- .../metrics/available_metrics/agents.md | 101 ++++++++++++++++++ ragas/src/ragas/metrics/_tool_call_f1.py | 53 +++++++++ ragas/tests/unit/test_tool_call_f1.py | 65 +++++++++++ 3 files changed, 219 insertions(+) create mode 100644 ragas/src/ragas/metrics/_tool_call_f1.py create mode 100644 ragas/tests/unit/test_tool_call_f1.py diff --git a/docs/concepts/metrics/available_metrics/agents.md b/docs/concepts/metrics/available_metrics/agents.md index 156475a5a..20d24f6bf 100644 --- a/docs/concepts/metrics/available_metrics/agents.md +++ b/docs/concepts/metrics/available_metrics/agents.md @@ -122,6 +122,107 @@ from ragas.metrics._tool_call_accuracy import ToolCallAccuracy metric = ToolCallAccuracy() metric.arg_comparison_metric = NonLLMStringSimilarity() ``` +## Tool Call F1 + +`ToolCallF1` is a metric that returns an F1-score based on precision and recall of tool calls made by an agent, comparing them to a set of expected calls (`reference_tool_calls`). While `ToolCallAccuracy` provides a binary score based on exact order and content match, `ToolCallF1` complements it by offering a softer evaluation useful for onboarding and iteration. It helps quantify how close the agent was to the expected behavior even if it over- or under-calls. 
+ +### Formula + +ToolCallF1 is based on classic IR metrics: + +$$ +\text{Precision} = \frac{\text{tool calls that match both name and parameters}}{\text{tool calls that match both name and parameters} + \text{extra tool calls that were not expected}} +$$ + +$$ +\text{Recall} = \frac{\text{tool calls that match both name and parameters}}{\text{tool calls that match both name and parameters} + \text{expected tool calls that were not made}} +$$ + +$$ +\text{F1} = \frac{2 \cdot \text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}} +$$ + +### How is it different from Topic Adherence? + +While both `ToolCallF1` and `TopicAdherenceScore` use precision, recall, and F1-score, they evaluate different aspects: + +| Metric | Evaluates | Based on | +| --------------------- | --------------------------------------- | ---------------------------- | +| `ToolCallF1` | Correctness of tool executions | Structured tool call objects | +| `TopicAdherenceScore` | Whether the conversation stays on-topic | Comparison of domain topics | + +Use `ToolCallF1` when you want to track whether the agent correctly **executed tools**. Use `TopicAdherenceScore` when evaluating whether the **content or intention** stays within allowed topics. 
+ +### Example: Matching Expected Tool Calls + +```python +from ragas.metrics import ToolCallF1 +from ragas.dataset_schema import MultiTurnSample +from ragas.messages import HumanMessage, AIMessage, ToolMessage, ToolCall + +sample = [ + HumanMessage(content="What's the weather like in Paris today?"), + AIMessage(content="Let me check that for you.", tool_calls=[ + ToolCall(name="weather_check", args={"location": "Paris"}) + ]), + HumanMessage(content="And the UV index?"), + AIMessage(content="Sure, here's the UV index for Paris.", tool_calls=[ + ToolCall(name="uv_index_lookup", args={"location": "Paris"}) + ]) +] + +sample = MultiTurnSample( + user_input=sample, + reference_tool_calls=[ + ToolCall(name="weather_check", args={"location": "Paris"}), + ToolCall(name="uv_index_lookup", args={"location": "Paris"}) + ] +) + +scorer = ToolCallF1() +await scorer.multi_turn_ascore(sample) +``` + +Output + +``` +1.0 +``` + +### Example: Extra Tool Called + +```python +sample = [ + HumanMessage(content="What's the weather like in Paris today?"), + AIMessage(content="Let me check that for you.", tool_calls=[ + ToolCall(name="weather_check", args={"location": "Paris"}) + ]), + HumanMessage(content="And the UV index?"), + AIMessage(content="Sure, here's the UV index for Paris.", tool_calls=[ + ToolCall(name="uv_index_lookup", args={"location": "Paris"}), + ToolCall(name="air_quality", args={"location": "Paris"}) # extra call + ]) +] + +sample = MultiTurnSample( + user_input=sample, + reference_tool_calls=[ + ToolCall(name="uv_index_lookup", args={"location": "Paris"}), + ToolCall(name="weather_check", args={"location": "Paris"}) + ] +) + +await scorer.multi_turn_ascore(sample) +``` + +Output + +``` +0.67 +``` + +In this case, the agent calls both correct tools but adds an extra `air_quality` call. The F1-score reflects partial correctness instead of failing the example completely. 
+ ## Agent Goal accuracy diff --git a/ragas/src/ragas/metrics/_tool_call_f1.py b/ragas/src/ragas/metrics/_tool_call_f1.py new file mode 100644 index 000000000..c97047395 --- /dev/null +++ b/ragas/src/ragas/metrics/_tool_call_f1.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +import typing as t +from dataclasses import dataclass, field + +from ragas.metrics.base import MultiTurnMetric, MetricType +from ragas.types import MultiTurnSample +from ragas.utils.typing import ScoreType + +if t.TYPE_CHECKING: + from langchain_core.callbacks.base import Callbacks + + +@dataclass +class ToolCallF1(MultiTurnMetric): + name: str = "tool_call_f1" + batch_size: int = 1 + is_multi_turn: bool = True + _required_columns: t.Dict[MetricType, t.Set[str]] = field( + default_factory=lambda: { + MetricType.MULTI_TURN: { + "reference_tool_calls", + "agent_messages", + } + } + ) + + async def _multi_turn_ascore( + self, sample: MultiTurnSample, callbacks: t.Optional[Callbacks] = None + ) -> ScoreType: + expected: set[tuple[str, frozenset]] = set() + if sample.reference_tool_calls: + for call in sample.reference_tool_calls: + expected.add((call.name, frozenset(call.parameters.items()))) + + actual: set[tuple[str, frozenset]] = set() + for msg in sample.agent_messages: + if msg.tool_calls: + for call in msg.tool_calls: + actual.add((call.name, frozenset(call.parameters.items()))) + + tp = len(actual & expected) + fp = len(actual - expected) + fn = len(expected - actual) + + precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0 + recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0 + f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0 + + return round(f1, 4) + + async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> ScoreType: + return await self._multi_turn_ascore(MultiTurnSample(**row), callbacks) diff --git a/ragas/tests/unit/test_tool_call_f1.py b/ragas/tests/unit/test_tool_call_f1.py new file mode 100644 index 000000000..26ce71757 --- 
/dev/null +++ b/ragas/tests/unit/test_tool_call_f1.py @@ -0,0 +1,65 @@ +import pytest +from ragas.messages import ToolCall, AIMessage, HumanMessage +from ragas.types import MultiTurnSample +from ragas.metrics._tool_call_f1 import ToolCallF1 + +metric = ToolCallF1() + +def make_sample(expected, predicted): + return MultiTurnSample( + user_input=[HumanMessage(content="What is the weather in Paris?")], + agent_messages=[ + AIMessage( + content="Let me check the weather forecast", + tool_calls=predicted + ) + ], + reference_tool_calls=expected, + reference="Expected correct weather tool call" + ) + +def test_tool_call_f1_full_match(): + expected = [ + ToolCall(name="WeatherForecast", parameters={"location": "Paris"}) + ] + predicted = [ + ToolCall(name="WeatherForecast", parameters={"location": "Paris"}) + ] + sample = make_sample(expected, predicted) + score = pytest.run(metric._multi_turn_ascore(sample)) + assert score == 1.0 + +def test_tool_call_f1_partial_match(): + expected = [ + ToolCall(name="WeatherForecast", parameters={"location": "Paris"}), + ToolCall(name="UVIndex", parameters={"location": "Paris"}) + ] + predicted = [ + ToolCall(name="WeatherForecast", parameters={"location": "Paris"}) + ] + sample = make_sample(expected, predicted) + score = pytest.run(metric._multi_turn_ascore(sample)) + assert round(score, 2) == 0.67 + +def test_tool_call_f1_no_match(): + expected = [ + ToolCall(name="WeatherForecast", parameters={"location": "Paris"}) + ] + predicted = [ + ToolCall(name="AirQuality", parameters={"location": "Paris"}) + ] + sample = make_sample(expected, predicted) + score = pytest.run(metric._multi_turn_ascore(sample)) + assert score == 0.0 + +def test_tool_call_f1_extra_call(): + expected = [ + ToolCall(name="WeatherForecast", parameters={"location": "Paris"}) + ] + predicted = [ + ToolCall(name="WeatherForecast", parameters={"location": "Paris"}), + ToolCall(name="AirQuality", parameters={"location": "Paris"}) + ] + sample = make_sample(expected, 
predicted) + score = pytest.run(metric._multi_turn_ascore(sample)) + assert round(score, 2) == 0.67 From 4a3ce83363fded532d03f9b57ebbbf652e60dfc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daiane=20Galv=C3=A3o?= <104103307+daiane-galvao@users.noreply.github.com> Date: Mon, 30 Jun 2025 19:54:04 -0300 Subject: [PATCH 2/4] feat(metrics): register ToolCallF1 --- ragas/src/ragas/metrics/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ragas/src/ragas/metrics/__init__.py b/ragas/src/ragas/metrics/__init__.py index 381203031..1e4c0692d 100644 --- a/ragas/src/ragas/metrics/__init__.py +++ b/ragas/src/ragas/metrics/__init__.py @@ -63,6 +63,7 @@ ) from ragas.metrics._summarization import SummarizationScore, summarization_score from ragas.metrics._tool_call_accuracy import ToolCallAccuracy +from ragas.metrics._tool_call_f1 import ToolCallF1 from ragas.metrics._topic_adherence import TopicAdherenceScore from ragas.metrics.base import ( Metric, @@ -126,6 +127,7 @@ "LLMSQLEquivalence", "AgentGoalAccuracyWithoutReference", "AgentGoalAccuracyWithReference", + "ToolCallF1", "ToolCallAccuracy", "ResponseRelevancy", "SemanticSimilarity", From d51abf304ab4ba67082b97903a47701d9d8a9061 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daiane=20Galv=C3=A3o?= <104103307+daiane-galvao@users.noreply.github.com> Date: Tue, 1 Jul 2025 09:58:59 -0300 Subject: [PATCH 3/4] feat(metrics): PR suggestions ToolCallF1 --- docs/concepts/metrics/available_metrics/agents.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/concepts/metrics/available_metrics/agents.md b/docs/concepts/metrics/available_metrics/agents.md index 20d24f6bf..3aee5c7ce 100644 --- a/docs/concepts/metrics/available_metrics/agents.md +++ b/docs/concepts/metrics/available_metrics/agents.md @@ -128,7 +128,7 @@ metric.arg_comparison_metric = NonLLMStringSimilarity() ### Formula -ToolCallF1 is based on classic IR metrics: +ToolCallF1 is based on classic IR metrics. 
It uses unordered matching: the order in which the tools are called does not impact the result, only the presence and correctness of tool names and parameters are considered. $$ \text{Precision} = \frac{\text{tool calls that match both name and parameters}}{\text{tool calls that match both name and parameters} + \text{extra tool calls that were not expected}} From 68209da58d8a04a6fee45fc582da997ebed81976 Mon Sep 17 00:00:00 2001 From: Daiane Galvao Date: Thu, 3 Jul 2025 16:17:41 -0300 Subject: [PATCH 4/4] fix(tool_call_f1): changes suggestions by AI pipeline --- ragas/src/ragas/metrics/_tool_call_f1.py | 27 +++++---- ragas/tests/unit/test_tool_call_f1.py | 72 +++++++++++------------- 2 files changed, 51 insertions(+), 48 deletions(-) diff --git a/ragas/src/ragas/metrics/_tool_call_f1.py b/ragas/src/ragas/metrics/_tool_call_f1.py index c97047395..97cd21387 100644 --- a/ragas/src/ragas/metrics/_tool_call_f1.py +++ b/ragas/src/ragas/metrics/_tool_call_f1.py @@ -4,8 +4,8 @@ from dataclasses import dataclass, field from ragas.metrics.base import MultiTurnMetric, MetricType -from ragas.types import MultiTurnSample -from ragas.utils.typing import ScoreType +from ragas.dataset_schema import MultiTurnSample +from ragas.messages import AIMessage if t.TYPE_CHECKING: from langchain_core.callbacks.base import Callbacks @@ -20,24 +20,27 @@ class ToolCallF1(MultiTurnMetric): default_factory=lambda: { MetricType.MULTI_TURN: { "reference_tool_calls", - "agent_messages", + "user_input", } } ) + def init(self, run_config): + pass + async def _multi_turn_ascore( self, sample: MultiTurnSample, callbacks: t.Optional[Callbacks] = None - ) -> ScoreType: + ) -> float: expected: set[tuple[str, frozenset]] = set() if sample.reference_tool_calls: for call in sample.reference_tool_calls: - expected.add((call.name, frozenset(call.parameters.items()))) + expected.add((call.name, frozenset(call.args.items()))) actual: set[tuple[str, frozenset]] = set() - for msg in sample.agent_messages: - if 
msg.tool_calls: + for msg in sample.user_input: + if isinstance(msg, AIMessage) and msg.tool_calls is not None: for call in msg.tool_calls: - actual.add((call.name, frozenset(call.parameters.items()))) + actual.add((call.name, frozenset(call.args.items()))) tp = len(actual & expected) fp = len(actual - expected) @@ -45,9 +48,13 @@ async def _multi_turn_ascore( precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0 recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0 - f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0 + f1 = ( + 2 * precision * recall / (precision + recall) + if (precision + recall) > 0 + else 0.0 + ) return round(f1, 4) - async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> ScoreType: + async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: return await self._multi_turn_ascore(MultiTurnSample(**row), callbacks) diff --git a/ragas/tests/unit/test_tool_call_f1.py b/ragas/tests/unit/test_tool_call_f1.py index 26ce71757..e9575b7c0 100644 --- a/ragas/tests/unit/test_tool_call_f1.py +++ b/ragas/tests/unit/test_tool_call_f1.py @@ -1,65 +1,61 @@ import pytest from ragas.messages import ToolCall, AIMessage, HumanMessage -from ragas.types import MultiTurnSample -from ragas.metrics._tool_call_f1 import ToolCallF1 +from ragas import MultiTurnSample +from ragas.metrics import ToolCallF1 metric = ToolCallF1() + def make_sample(expected, predicted): return MultiTurnSample( - user_input=[HumanMessage(content="What is the weather in Paris?")], - agent_messages=[ + user_input=[ + HumanMessage(content="What is the weather in Paris?"), AIMessage( - content="Let me check the weather forecast", - tool_calls=predicted - ) + content="Let me check the weather forecast", tool_calls=predicted + ), ], reference_tool_calls=expected, - reference="Expected correct weather tool call" + reference="Expected correct weather tool call", ) -def test_tool_call_f1_full_match(): - expected = [ - ToolCall(name="WeatherForecast", 
parameters={"location": "Paris"}) - ] - predicted = [ - ToolCall(name="WeatherForecast", parameters={"location": "Paris"}) - ] + +@pytest.mark.asyncio +async def test_tool_call_f1_full_match(): + expected = [ToolCall(name="WeatherForecast", args={"location": "Paris"})] + predicted = [ToolCall(name="WeatherForecast", args={"location": "Paris"})] sample = make_sample(expected, predicted) - score = pytest.run(metric._multi_turn_ascore(sample)) + score = await metric._multi_turn_ascore(sample) assert score == 1.0 -def test_tool_call_f1_partial_match(): + +@pytest.mark.asyncio +async def test_tool_call_f1_partial_match(): expected = [ - ToolCall(name="WeatherForecast", parameters={"location": "Paris"}), - ToolCall(name="UVIndex", parameters={"location": "Paris"}) - ] - predicted = [ - ToolCall(name="WeatherForecast", parameters={"location": "Paris"}) + ToolCall(name="WeatherForecast", args={"location": "Paris"}), + ToolCall(name="UVIndex", args={"location": "Paris"}), ] + predicted = [ToolCall(name="WeatherForecast", args={"location": "Paris"})] sample = make_sample(expected, predicted) - score = pytest.run(metric._multi_turn_ascore(sample)) + score = await metric._multi_turn_ascore(sample) assert round(score, 2) == 0.67 -def test_tool_call_f1_no_match(): - expected = [ - ToolCall(name="WeatherForecast", parameters={"location": "Paris"}) - ] - predicted = [ - ToolCall(name="AirQuality", parameters={"location": "Paris"}) - ] + +@pytest.mark.asyncio +async def test_tool_call_f1_no_match(): + expected = [ToolCall(name="WeatherForecast", args={"location": "Paris"})] + predicted = [ToolCall(name="AirQuality", args={"location": "Paris"})] sample = make_sample(expected, predicted) - score = pytest.run(metric._multi_turn_ascore(sample)) + score = await metric._multi_turn_ascore(sample) assert score == 0.0 -def test_tool_call_f1_extra_call(): - expected = [ - ToolCall(name="WeatherForecast", parameters={"location": "Paris"}) - ] + +@pytest.mark.asyncio +async def 
test_tool_call_f1_extra_call(): + expected = [ToolCall(name="WeatherForecast", args={"location": "Paris"})] predicted = [ - ToolCall(name="WeatherForecast", parameters={"location": "Paris"}), - ToolCall(name="AirQuality", parameters={"location": "Paris"}) + ToolCall(name="WeatherForecast", args={"location": "Paris"}), + ToolCall(name="AirQuality", args={"location": "Paris"}), ] sample = make_sample(expected, predicted) - score = pytest.run(metric._multi_turn_ascore(sample)) + score = await metric._multi_turn_ascore(sample) assert round(score, 2) == 0.67