Commit 47c60b5

feat: llm as prompt as optional (#2084)
- LLM-based metric

  ```py
  test_metric = DiscreteMetric(
      name="test_metric",
      prompt="Is the {response} a good response to the query {query}?",
      values=["pass", "fail"],
  )
  ```

- Writing custom metric logic

  ```py
  @numeric_metric(
      name="test_metric",
      range=(0, 1),
  )
  def test_metric(
      query: str,
      response: str,
  ) -> MetricResult:
      """
      Is the response a good response to the query?
      """
      result = 0
      return MetricResult(result=result, reason="")
  ```
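For reference, a minimal sketch of scoring a metric under the new API, where the LLM is supplied per call rather than bound to the metric at construction. Here `my_llm` stands for an already-configured `RagasLLM` instance (an assumption for illustration), and `test_metric` is the `DiscreteMetric` defined above:

```py
# The LLM is now a call-time argument; the metric itself only carries the prompt.
result = test_metric.score(
    llm=my_llm,  # assumed: a configured RagasLLM instance
    query="What is Ragas?",
    response="Ragas is an LLM evaluation toolkit.",
)
print(result.result)  # "pass" or "fail"
print(result.reason)  # model-generated reasoning

# A decorator-based metric whose function declares neither `llm` nor `prompt`
# (like the numeric example above) can be scored without passing an LLM at all.
```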
1 parent 10061a8 commit 47c60b5

File tree

6 files changed: +135 / -191 lines changed


experimental/ragas_experimental/metric/base.py

Lines changed: 29 additions & 45 deletions
```diff
@@ -6,7 +6,7 @@
 __all__ = ['Metric']
 
 # %% ../../nbs/api/metric/base.ipynb 2
-from abc import ABC, abstractmethod
+from abc import ABC
 import asyncio
 from dataclasses import dataclass, field
 from pydantic import BaseModel
@@ -31,24 +31,13 @@ class Metric(ABC):
     """Base class for all metrics in the LLM evaluation library."""
 
     name: str
-    prompt: str | Prompt
-    llm: RagasLLM
-    _response_models: t.Dict[bool, t.Type[BaseModel]] = field(
-        default_factory=dict, init=False, repr=False
-    )
+    prompt: t.Optional[t.Union[str, Prompt]] = None
+    _response_model: t.Type[BaseModel] = field(init=False)
 
     def __post_init__(self):
         if isinstance(self.prompt, str):
             self.prompt = Prompt(self.prompt)
 
-    @abstractmethod
-    def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:
-        """Get the appropriate response model."""
-        pass
-
-    @abstractmethod
-    def _ensemble(self, results: t.List[MetricResult]) -> MetricResult:
-        pass
 
     def get_variables(self) -> t.List[str]:
         if isinstance(self.prompt, Prompt):
@@ -62,54 +51,49 @@ def get_variables(self) -> t.List[str]:
         ]
         return vars
 
-    def score(self, reasoning: bool = True, n: int = 1, **kwargs) -> t.Any:
-        responses = []
+    def score(self, llm: RagasLLM, **kwargs) -> MetricResult:
+
         traces = {}
         traces["input"] = kwargs
         prompt_input = self.prompt.format(**kwargs)
-        for _ in range(n):
-            response = self.llm.generate(
-                prompt_input, response_model=self._get_response_model(reasoning)
-            )
-            traces["output"] = response.model_dump()
-            response = MetricResult(**response.model_dump())
-            responses.append(response)
-        results = self._ensemble(responses)
-        results.traces = traces
-        return results
+        response = llm.generate(
+            prompt_input, response_model=self._response_model
+        )
+        traces["output"] = response.model_dump()
+        result = MetricResult(**response.model_dump())
+        result.traces = traces
+        return result
 
     async def ascore(
-        self, reasoning: bool = True, n: int = 1, **kwargs
+        self, llm: RagasLLM, **kwargs
     ) -> MetricResult:
-        responses = []  # Added missing initialization
+
         traces = {}
-        traces["input"] = kwargs
+
         prompt_input = self.prompt.format(**kwargs)
-        for _ in range(n):
-            response = await self.llm.agenerate(
-                prompt_input, response_model=self._get_response_model(reasoning)
-            )
-            traces["output"] = response.model_dump()
-            response = MetricResult(
-                **response.model_dump()
-            )  # Fixed missing parentheses
-            responses.append(response)
-        results = self._ensemble(responses)
-        results.traces = traces
-        return results
+        traces["input"] = prompt_input
+        response = await llm.agenerate(
+            prompt_input, response_model=self._response_model,
+        )
+        traces["output"] = response.model_dump()
+        result = MetricResult(
+            **response.model_dump()
+        )  # Fixed missing parentheses
+        result.traces = traces
+        return result
 
     def batch_score(
-        self, inputs: t.List[t.Dict[str, t.Any]], reasoning: bool = True, n: int = 1
+        self, llm: RagasLLM, inputs: t.List[t.Dict[str, t.Any]],
     ) -> t.List[t.Any]:
-        return [self.score(reasoning, n, **input_dict) for input_dict in inputs]
+        return [self.score(llm, **input_dict) for input_dict in inputs]
 
     async def abatch_score(
-        self, inputs: t.List[t.Dict[str, t.Any]], reasoning: bool = True, n: int = 1
+        self, llm: RagasLLM, inputs: t.List[t.Dict[str, t.Any]],
    ) -> t.List[MetricResult]:
         async_tasks = []
         for input_dict in inputs:
             # Add reasoning and n to the input parameters
-            async_tasks.append(self.ascore(reasoning=reasoning, n=n, **input_dict))
+            async_tasks.append(self.ascore(llm, **input_dict))
 
         # Run all tasks concurrently and return results
         return await asyncio.gather(*async_tasks)
```
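To make the reshaped surface concrete, a short sketch of single, batch, and async scoring against the new signatures, reusing the `test_metric` and `my_llm` assumptions from the commit message above:

```py
import asyncio

# score()/ascore() now take the LLM as their first argument instead of
# reading it from a field on the metric.
single = test_metric.score(my_llm, query="What is Ragas?", response="An eval library.")

# batch_score() forwards the same LLM to every input dict.
inputs = [
    {"query": "q1", "response": "r1"},
    {"query": "q2", "response": "r2"},
]
batch = test_metric.batch_score(my_llm, inputs)

# abatch_score() builds one ascore() task per input and awaits them all
# concurrently with asyncio.gather.
async_batch = asyncio.run(test_metric.abatch_score(my_llm, inputs))
```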

experimental/ragas_experimental/metric/decorator.py

Lines changed: 79 additions & 16 deletions
```diff
@@ -12,7 +12,6 @@
 from dataclasses import dataclass
 from . import MetricResult
 from ..llm import RagasLLM
-from ..prompt.base import Prompt
 
 
 def create_metric_decorator(metric_class):
@@ -27,8 +26,6 @@ def create_metric_decorator(metric_class):
     """
 
     def decorator_factory(
-        llm: RagasLLM,
-        prompt: t.Union[str, Prompt],
         name: t.Optional[str] = None,
         **metric_params,
     ):
@@ -50,24 +47,62 @@ def decorator(func):
             # Get metric name and check if function is async
             metric_name = name or func.__name__
             is_async = inspect.iscoroutinefunction(func)
+
+            # Check function signature to determine if it expects llm/prompt
+            sig = inspect.signature(func)
+            param_names = list(sig.parameters.keys())
+            expects_llm = 'llm' in param_names
+            expects_prompt = 'prompt' in param_names
 
             # TODO: Move to dataclass type implementation
             @dataclass
             class CustomMetric(metric_class):
+
+                def _validate_result_value(self, result_value):
+                    """Validate result value based on metric type constraints."""
+                    # Discrete metric validation
+                    if hasattr(self, 'values') and result_value not in self.values:
+                        return f"Metric {self.name} returned '{result_value}' but expected one of {self.values}"
+
+                    # Numeric metric validation
+                    if hasattr(self, 'range'):
+                        if not isinstance(result_value, (int, float)):
+                            return f"Metric {self.name} returned '{result_value}' but expected a numeric value"
+                        min_val, max_val = self.range
+                        if not (min_val <= result_value <= max_val):
+                            return f"Metric {self.name} returned {result_value} but expected value in range {self.range}"
+
+                    # Ranking metric validation
+                    if hasattr(self, 'num_ranks'):
+                        if not isinstance(result_value, list):
+                            return f"Metric {self.name} returned '{result_value}' but expected a list"
+                        if len(result_value) != self.num_ranks:
+                            return f"Metric {self.name} returned list of length {len(result_value)} but expected {self.num_ranks} items"
+
+                    return None  # No validation error
 
                 def _run_sync_in_async(self, func, *args, **kwargs):
                     """Run a synchronous function in an async context."""
                     # For sync functions, just run them normally
                     return func(*args, **kwargs)
 
-                def _execute_metric(self, is_async_execution, reasoning, **kwargs):
+                def _execute_metric(self, llm, is_async_execution, **kwargs):
                     """Execute the metric function with proper async handling."""
                     try:
+                        # Prepare function arguments based on what the function expects
+                        func_kwargs = kwargs.copy()
+                        func_args = []
+
+                        if expects_llm:
+                            func_args.append(llm)
+                        if expects_prompt:
+                            func_args.append(self.prompt)
+
                         if is_async:
                             # Async function implementation
                             if is_async_execution:
                                 # In async context, await the function directly
-                                result = func(self.llm, self.prompt, **kwargs)
+                                result = func(*func_args, **func_kwargs)
                             else:
                                 # In sync context, run the async function in an event loop
                                 try:
@@ -76,40 +111,68 @@ def _execute_metric(self, is_async_execution, reasoning, **kwargs):
                                     loop = asyncio.new_event_loop()
                                     asyncio.set_event_loop(loop)
                                     result = loop.run_until_complete(
-                                        func(self.llm, self.prompt, **kwargs)
+                                        func(*func_args, **func_kwargs)
                                     )
                         else:
                             # Sync function implementation
-                            result = func(self.llm, self.prompt, **kwargs)
-
+                            result = func(*func_args, **func_kwargs)
+
+                        # Ensure result is a MetricResult
+                        if not isinstance(result, MetricResult):
+                            raise ValueError(f"Custom metric function must return MetricResult, got {type(result)}")
+
+                        # Validate the result based on metric type
+                        validation_error = self._validate_result_value(result.result)
+                        if validation_error:
+                            return MetricResult(result=None, reason=validation_error)
+
                         return result
+
                     except Exception as e:
                         # Handle errors gracefully
                         error_msg = f"Error executing metric {self.name}: {str(e)}"
                         return MetricResult(result=None, reason=error_msg)
 
-                def score(self, reasoning: bool = True, n: int = 1, **kwargs):
+                def score(self, llm: t.Optional[RagasLLM] = None, **kwargs):
                     """Synchronous scoring method."""
                     return self._execute_metric(
-                        is_async_execution=False, reasoning=reasoning, **kwargs
+                        llm, is_async_execution=False, **kwargs
                    )
 
-                async def ascore(self, reasoning: bool = True, n: int = 1, **kwargs):
+                async def ascore(self, llm: t.Optional[RagasLLM] = None, **kwargs):
                     """Asynchronous scoring method."""
+                    # Prepare function arguments based on what the function expects
+                    func_kwargs = kwargs.copy()
+                    func_args = []
+
+                    if expects_llm:
+                        func_args.append(llm)
+                    if expects_prompt:
+                        func_args.append(self.prompt)
+
                    if is_async:
                         # For async functions, await the result
-                        result = await func(self.llm, self.prompt, **kwargs)
-                        return self._extract_result(result, reasoning)
+                        result = await func(*func_args, **func_kwargs)
                     else:
                         # For sync functions, run normally
                         result = self._run_sync_in_async(
-                            func, self.llm, self.prompt, **kwargs
+                            func, *func_args, **func_kwargs
                         )
-                    return result
+
+                    # Ensure result is a MetricResult
+                    if not isinstance(result, MetricResult):
+                        raise ValueError(f"Custom metric function must return MetricResult, got {type(result)}")
+
+                    # Validate the result based on metric type
+                    validation_error = self._validate_result_value(result.result)
+                    if validation_error:
+                        return MetricResult(result=None, reason=validation_error)
+
+                    return result
 
             # Create the metric instance with all parameters
             metric_instance = CustomMetric(
-                name=metric_name, prompt=prompt, llm=llm, **metric_params
+                name=metric_name,**metric_params
             )
 
             # Preserve metadata
```
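To illustrate the signature inspection added above: the decorator only injects `llm` (and `prompt`) into the wrapped function when the function declares parameters with those names; otherwise the metric runs as plain Python. A hedged sketch with illustrative metric names, import paths taken from this commit's file layout, and `my_llm` assumed to be a configured `RagasLLM`:

```py
from ragas_experimental.metric import MetricResult
from ragas_experimental.metric.discrete import discrete_metric

@discrete_metric(name="keyword_check", values=["pass", "fail"])
def keyword_check(query: str, response: str) -> MetricResult:
    # No `llm`/`prompt` parameters, so nothing is injected; pure Python logic.
    verdict = "pass" if "ragas" in response.lower() else "fail"
    return MetricResult(result=verdict, reason="keyword heuristic")

@discrete_metric(name="judge_check", values=["pass", "fail"])
def judge_check(llm, query: str, response: str) -> MetricResult:
    # Declaring an `llm` parameter opts in: the decorator passes the LLM that
    # was handed to score()/ascore(). A `prompt` parameter works the same way.
    verdict = "pass"  # placeholder: a real metric would call `llm` here
    return MetricResult(result=verdict, reason="judged with the injected llm")

keyword_check.score(query="what is ragas?", response="Ragas is a library")  # no LLM needed
judge_check.score(llm=my_llm, query="what is ragas?", response="...")       # my_llm assumed
```

Return values outside the metric's allowed set or range are caught by `_validate_result_value` and come back as `MetricResult(result=None, reason=...)` rather than raising.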

experimental/ragas_experimental/metric/discrete.py

Lines changed: 7 additions & 33 deletions
```diff
@@ -8,48 +8,22 @@
 # %% ../../nbs/api/metric/discrete.ipynb 2
 import typing as t
 from dataclasses import dataclass, field
-from pydantic import BaseModel, create_model
-from collections import Counter
-from . import Metric, MetricResult
+from pydantic import create_model
+from . import Metric
 from .decorator import create_metric_decorator
 
 
 @dataclass
 class DiscreteMetric(Metric):
     values: t.List[str] = field(default_factory=lambda: ["pass", "fail"])
 
-    def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:
-        """Get or create a response model based on reasoning parameter."""
-
-        if with_reasoning in self._response_models:
-            return self._response_models[with_reasoning]
-
-        model_name = "response_model"
+    def __post_init__(self):
+        super().__post_init__()
         values = tuple(self.values)
-        fields = {"result": (t.Literal[values], ...)}
-
-        if with_reasoning:
-            fields["reason"] = (str, ...)  # type: ignore
-
-        model = create_model(model_name, **fields)  # type: ignore
-        self._response_models[with_reasoning] = model
-        return model
-
-    def _ensemble(self, results: t.List[MetricResult]) -> MetricResult:
-
-        if len(results) == 1:
-            return results[0]
-
-        candidates = [candidate.result for candidate in results]
-        counter = Counter(candidates)
-        max_count = max(counter.values())
-        for candidate in results:
-            if counter[candidate.result] == max_count:
-                result = candidate.result
-                reason = candidate.reason
-                return MetricResult(result=result, reason=reason)
+        self._response_model = create_model("response_model",
+                                            result=(t.Literal[values], ...),
+                                            reason=(str, ...))
 
-        return results[0]
 
 
 discrete_metric = create_metric_decorator(DiscreteMetric)
```
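For intuition, a standalone sketch of the response model that `DiscreteMetric.__post_init__` now builds eagerly, mirroring the `create_model` call in the diff (the values shown are the defaults):

```py
import typing as t
from pydantic import create_model

values = ("pass", "fail")
ResponseModel = create_model(
    "response_model",
    result=(t.Literal[values], ...),  # constrained to the metric's allowed values
    reason=(str, ...),                # reasoning is now always part of the schema
)

parsed = ResponseModel(result="pass", reason="The response answers the query.")
print(parsed.model_dump())
# {'result': 'pass', 'reason': 'The response answers the query.'}

# A value outside the Literal fails at parse time:
# ResponseModel(result="maybe", reason="...")  -> pydantic.ValidationError
```

Because the model is fixed at construction time, the old per-call `_get_response_model(with_reasoning)` cache and the majority-vote `_ensemble` are no longer needed.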

experimental/ragas_experimental/metric/numeric.py

Lines changed: 6 additions & 28 deletions
```diff
@@ -8,41 +8,19 @@
 # %% ../../nbs/api/metric/numeric.ipynb 2
 import typing as t
 from dataclasses import dataclass, field
-from pydantic import BaseModel, create_model
-from . import Metric, MetricResult
+from pydantic import create_model
+from . import Metric
 from .decorator import create_metric_decorator
 
 
 @dataclass
 class NumericMetric(Metric):
-    range: t.Tuple[float, float]
+    range: t.Tuple[float, float] = (0.0, 1.0)
 
-    def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:
-        """Get or create a response model based on reasoning parameter."""
+    def __post_init__(self):
+        super().__post_init__()
+        self._response_model = create_model("response_model", result=(float, ...))
 
-        if with_reasoning in self._response_models:
-            return self._response_models[with_reasoning]
-
-        model_name = "response_model"
-        fields = {"result": (float, ...)}
-
-        if with_reasoning:
-            fields["reason"] = (str, ...)  # type: ignore
-
-        model = create_model(model_name, **fields)
-        self._response_models[with_reasoning] = model
-        return model
-
-    def _ensemble(self, results: t.List[MetricResult]) -> MetricResult:
-
-        if len(results) == 1:
-            return results[0]
-
-        candidates = [candidate.result for candidate in results]
-        result = sum(candidates) / len(candidates)
-        reason = results[0].reason
-
-        return MetricResult(result=result, reason=reason)
 
 
 numeric_metric = create_metric_decorator(NumericMetric)
```
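A short end-to-end sketch of the numeric path after this change: `range` now defaults to `(0.0, 1.0)`, and an out-of-range return from a decorated function is converted into an error result by the decorator's validation instead of raising (metric name and logic below are illustrative):

```py
from ragas_experimental.metric import MetricResult
from ragas_experimental.metric.numeric import numeric_metric

@numeric_metric(name="length_ratio", range=(0, 1))
def length_ratio(query: str, response: str) -> MetricResult:
    # Deliberately unclamped so long responses can exceed the declared range.
    ratio = len(response) / max(len(query), 1)
    return MetricResult(result=ratio, reason=f"len(response)/len(query) = {ratio:.2f}")

ok = length_ratio.score(query="0123456789", response="01234")
print(ok.result)    # 0.5 -> inside (0, 1), passes validation

bad = length_ratio.score(query="hi", response="a much longer response")
print(bad.result)   # None -> validation failed
print(bad.reason)   # "Metric length_ratio returned 11.0 but expected value in range (0, 1)"
```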
