Commit 1f3f4ad

Add support for atla as llm as a judge using text-only tools
1 parent 968e7ca commit 1f3f4ad

File tree

7 files changed: +258 -139 lines

evaluators/langevals/langevals_langevals/llm_answer_match.py

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ class LLMAnswerMatchEvaluator(
 
     def evaluate(self, entry: LLMAnswerMatchEntry) -> SingleEvaluationResult:
         total_tokens = len(
-            litellm.encode(
+            litellm.encode(  # type: ignore
                 model=self.settings.model,
                 text=f"{entry.input} {entry.output} {entry.expected_output}",
            )
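
The only change here is a `# type: ignore` on `litellm.encode`, which quiets the type checker without altering behaviour: the call still tokenizes the concatenated entry fields so the evaluator can guard against oversized inputs. A minimal sketch of that token-count guard in isolation (the helper name, model name, and threshold are illustrative, not from the commit):

import litellm

def exceeds_context(model: str, text: str, max_tokens: int = 16_384) -> bool:
    # litellm.encode picks the tokenizer matching the given model name
    # (falling back to a default tokenizer for models it does not know).
    tokens = litellm.encode(model=model, text=text)  # type: ignore
    return len(tokens) > max_tokens

# Usage, mirroring the evaluators touched by this commit:
# exceeds_context("gpt-4o-mini", f"{input} {output} {expected_output}")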

evaluators/langevals/langevals_langevals/llm_boolean.py

Lines changed: 56 additions & 43 deletions
@@ -16,6 +16,7 @@
 from litellm import Choices, Message
 from litellm.files.main import ModelResponse
 from litellm.cost_calculator import completion_cost
+import dspy
 
 
 class CustomLLMBooleanEntry(EvaluatorEntry):
@@ -33,7 +34,7 @@ class CustomLLMBooleanSettings(LLMEvaluatorSettings):
 
 
 class CustomLLMBooleanResult(EvaluationResult):
-    score: float
+    score: float = Field(default=0.0)
     passed: Optional[bool] = Field(
         description="The veredict given by the LLM", default=True
     )
@@ -74,7 +75,7 @@ def evaluate(self, entry: CustomLLMBooleanEntry) -> SingleEvaluationResult:
         content += f"# Task\n{self.settings.prompt}"
 
         total_tokens = len(
-            litellm.encode(
+            litellm.encode(  # type: ignore
                 model=self.settings.model, text=f"{self.settings.prompt} {content}"
             )
         )
@@ -86,55 +87,67 @@ def evaluate(self, entry: CustomLLMBooleanEntry) -> SingleEvaluationResult:
 
         cost = None
 
-        response = litellm.completion(
-            model=self.settings.model,
-            messages=[
-                {
-                    "role": "system",
-                    "content": self.settings.prompt
-                    + ". Always output a valid json for the function call",
-                },
-                {
-                    "role": "user",
-                    "content": content,
-                },
-            ],
-            tools=[
-                {
-                    "type": "function",
-                    "function": {
-                        "name": "evaluation",
-                        "parameters": {
-                            "type": "object",
-                            "properties": {
-                                "scratchpad": {
-                                    "type": "string",
-                                    "description": "use this field to ponder and write a short reasoning behind the decision written before a result is actually given",
-                                },
-                                "passed": {
-                                    "type": "boolean",
-                                    "description": "your final veredict, reply true or false if the content passes the test or not",
-                                },
-                            },
-                            "required": ["scratchpad", "passed"],
-                        },
-                        "description": "use this function to write your thoughts on the scratchpad, then decide if it passed or not with this json structure",
-                    },
-                },
-            ],
-            tool_choice={"type": "function", "function": {"name": "evaluation"}},  # type: ignore
-        )
-
-        response = cast(ModelResponse, response)
-        choice = cast(Choices, response.choices[0])
-        arguments = json.loads(
-            cast(Message, choice.message).tool_calls[0].function.arguments
-        )
-        cost = completion_cost(completion_response=response)
+        if "atla-selene" in self.settings.model:
+
+            class LLMJudge(dspy.Signature):
+                content: str = dspy.InputField()
+                reasoning: str = dspy.OutputField()
+                passed: bool = dspy.OutputField()
+
+            judge = dspy.Predict(LLMJudge.with_instructions(self.settings.prompt))
+            judge.set_lm(lm=dspy.LM(model=self.settings.model))
+            arguments = judge(content=content)
+
+        else:
+            response = litellm.completion(
+                model=self.settings.model,
+                messages=[
+                    {
+                        "role": "system",
+                        "content": self.settings.prompt
+                        + ". Always output a valid json for the function call",
+                    },
+                    {
+                        "role": "user",
+                        "content": content,
+                    },
+                ],
+                tools=[
+                    {
+                        "type": "function",
+                        "function": {
+                            "name": "evaluation",
+                            "parameters": {
+                                "type": "object",
+                                "properties": {
+                                    "reasoning": {
+                                        "type": "string",
+                                        "description": "use this field to ponder and write a short reasoning behind the decision written before a result is actually given",
+                                    },
+                                    "passed": {
+                                        "type": "boolean",
+                                        "description": "your final veredict, reply true or false if the content passes the test or not",
+                                    },
+                                },
+                                "required": ["reasoning", "passed"],
+                            },
+                            "description": "use this function to write your thoughts on the reasoning, then decide if it passed or not with this json structure",
+                        },
+                    },
+                ],
+                tool_choice={"type": "function", "function": {"name": "evaluation"}},  # type: ignore
+            )
+
+            response = cast(ModelResponse, response)
+            choice = cast(Choices, response.choices[0])
+            arguments = json.loads(
+                cast(Message, choice.message).tool_calls[0].function.arguments  # type: ignore
+            )
+            cost = completion_cost(completion_response=response)
 
         return CustomLLMBooleanResult(
             score=1 if arguments["passed"] else 0,
             passed=arguments["passed"],
-            details=arguments["scratchpad"],
+            details=arguments["reasoning"],
             cost=Money(amount=cost, currency="USD") if cost else None,
         )
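
Atla's Selene judge models are routed around the litellm tool-calling path because they are used here as text-only judges: the verdict and reasoning come back as structured text parsed by a dspy program rather than as a function-call payload. A minimal standalone sketch of that pattern, assuming dspy is installed; the prompt and the litellm-style model string below are placeholders, not values from the commit:

import dspy

PROMPT = "Does the output answer the input politely and on-topic?"  # placeholder
MODEL = "openai/atla-selene"  # placeholder model identifier


class LLMJudge(dspy.Signature):
    # Text-only judge: outputs are parsed from the model's plain text reply,
    # so no tools/function-calling support is required from the provider.
    content: str = dspy.InputField()
    reasoning: str = dspy.OutputField()
    passed: bool = dspy.OutputField()


judge = dspy.Predict(LLMJudge.with_instructions(PROMPT))
judge.set_lm(lm=dspy.LM(model=MODEL))

result = judge(content="Input: hi\nOutput: hello, how can I help?")
# dspy predictions allow both attribute and item access, which is why the
# evaluator above can keep reading arguments["passed"] / arguments["reasoning"]
# from either branch.
print(result.passed, result.reasoning)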

evaluators/langevals/langevals_langevals/llm_category.py

Lines changed: 65 additions & 45 deletions
@@ -17,6 +17,7 @@
 from litellm.types.utils import ModelResponse
 from litellm.cost_calculator import completion_cost
 from litellm.utils import encode
+import dspy
 
 
 class CustomLLMCategoryEntry(EvaluatorEntry):
@@ -109,58 +110,77 @@ def evaluate(self, entry: CustomLLMCategoryEntry) -> SingleEvaluationResult:
 
         cost = None
 
-        response = litellm.completion(
-            model=self.settings.model,
-            messages=[
-                {
-                    "role": "system",
-                    "content": self.settings.prompt
-                    + ". Always output a valid json for the function call",
-                },
-                {
-                    "role": "user",
-                    "content": content,
-                },
-            ],
-            tools=[
-                {
-                    "type": "function",
-                    "function": {
-                        "name": "evaluation",
-                        "parameters": {
-                            "type": "object",
-                            "properties": {
-                                "scratchpad": {
-                                    "type": "string",
-                                    "description": "use this field to ponder and write a short reasoning behind the decision written before a result is actually given",
-                                },
-                                "label": {
-                                    "type": "string",
-                                    "description": "the final decision of the category for the message",
-                                    "enum": [
-                                        category.name
-                                        for category in self.settings.categories
-                                    ],
-                                },
-                            },
-                            "required": ["scratchpad", "label"],
-                        },
-                        "description": "use this function to write your thoughts on the scratchpad, then decide if it passed or not with this json structure",
-                    },
-                },
-            ],
-            tool_choice={"type": "function", "function": {"name": "evaluation"}},  # type: ignore
-        )
-
-        response = cast(ModelResponse, response)
-        choice = cast(Choices, response.choices[0])
-        arguments = json.loads(
-            cast(Message, choice.message).tool_calls[0].function.arguments  # type: ignore
-        )
-        cost = completion_cost(completion_response=response)
+        if "atla-selene" in self.settings.model:
+            # Workaround to get the Literal type for the categories at runtime
+            category_names = [
+                f'"{category.name}"' for category in self.settings.categories
+            ]
+            type_str = f"Literal[{', '.join(category_names)}]"
+            locals_dict = {"Literal": Literal}
+            type_ = eval(type_str, globals(), locals_dict)
+
+            class LLMJudge(dspy.Signature):
+                content: str = dspy.InputField()
+                reasoning: str = dspy.OutputField()
+                label: type_ = dspy.OutputField()  # type: ignore
+
+            judge = dspy.Predict(LLMJudge.with_instructions(self.settings.prompt))
+            judge.set_lm(lm=dspy.LM(model=self.settings.model))
+            arguments = judge(content=content)
+
+        else:
+            response = litellm.completion(
+                model=self.settings.model,
+                messages=[
+                    {
+                        "role": "system",
+                        "content": self.settings.prompt
+                        + ". Always output a valid json for the function call",
+                    },
+                    {
+                        "role": "user",
+                        "content": content,
+                    },
+                ],
+                tools=[
+                    {
+                        "type": "function",
+                        "function": {
+                            "name": "evaluation",
+                            "parameters": {
+                                "type": "object",
+                                "properties": {
+                                    "reasoning": {
+                                        "type": "string",
+                                        "description": "use this field to ponder and write a short reasoning behind the decision written before a result is actually given",
+                                    },
+                                    "label": {
+                                        "type": "string",
+                                        "description": "the final decision of the category for the message",
+                                        "enum": [
+                                            category.name
+                                            for category in self.settings.categories
+                                        ],
+                                    },
+                                },
+                                "required": ["reasoning", "label"],
+                            },
+                            "description": "use this function to write your thoughts on the reasoning, then decide if it passed or not with this json structure",
+                        },
+                    },
+                ],
+                tool_choice={"type": "function", "function": {"name": "evaluation"}},  # type: ignore
+            )
+
+            response = cast(ModelResponse, response)
+            choice = cast(Choices, response.choices[0])
+            arguments = json.loads(
+                cast(Message, choice.message).tool_calls[0].function.arguments  # type: ignore
+            )
+            cost = completion_cost(completion_response=response)
 
         return CustomLLMCategoryResult(
             label=arguments["label"],
-            details=arguments["scratchpad"],
+            details=arguments["reasoning"],
            cost=Money(amount=cost, currency="USD") if cost else None,
        )
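
The category evaluator needs the dspy output field typed as a Literal over category names that are only known at runtime, which is what the eval()-based workaround in the diff constructs. A small sketch of the same idea in isolation; the category names are made up, and the Literal[tuple(...)] form at the end is an equivalent construction noted for comparison, not what the commit uses:

from typing import Literal

# Hypothetical category names, only known at runtime.
names = ["positive", "negative", "neutral"]

# Approach taken in the commit: build the annotation source text and eval() it.
type_str = f"Literal[{', '.join(repr(n) for n in names)}]"
label_type = eval(type_str, {"Literal": Literal})

# Equivalent construction without eval(): subscripting Literal with a tuple
# expands to Literal["positive", "negative", "neutral"].
label_type_alt = Literal[tuple(names)]  # type: ignore

assert label_type == label_type_alt
print(label_type)  # typing.Literal['positive', 'negative', 'neutral']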

Comments (0)