Commit 44b3331

Fix dspy.Evaluate's handling of exceptions (from 2.5.30) (#1839)
1 parent ee6c166 commit 44b3331

dspy/evaluate/evaluate.py

Lines changed: 20 additions & 31 deletions
@@ -54,6 +54,7 @@ def __init__(
         return_all_scores=False,
         return_outputs=False,
         provide_traceback=False,
+        failure_score=0.0,
         **_kwargs,
     ):
         self.devset = devset
@@ -65,6 +66,7 @@ def __init__(
         self.return_all_scores = return_all_scores
         self.return_outputs = return_outputs
         self.provide_traceback = provide_traceback
+        self.failure_score = failure_score

     def __call__(
         self,
@@ -85,7 +87,6 @@ def __call__(
         return_all_scores = return_all_scores if return_all_scores is not None else self.return_all_scores
         return_outputs = return_outputs if return_outputs is not None else self.return_outputs

-        devset = list(enumerate(devset))
         tqdm.tqdm._instances.clear()

         executor = ParallelExecutor(
@@ -96,39 +97,27 @@ def __call__(
             compare_results=True,
         )

-        def process_item(item):
-            try:
-                example_idx, example = item
-                prediction = program(**example.inputs())
-                score = metric(example, prediction)
+        def process_item(example):
+            prediction = program(**example.inputs())
+            score = metric(example, prediction)

-                # Increment assert and suggest failures to program's attributes
-                if hasattr(program, "_assert_failures"):
-                    program._assert_failures += dspy.settings.get("assert_failures")
-                if hasattr(program, "_suggest_failures"):
-                    program._suggest_failures += dspy.settings.get("suggest_failures")
+            # Increment assert and suggest failures to program's attributes
+            if hasattr(program, "_assert_failures"):
+                program._assert_failures += dspy.settings.get("assert_failures")
+            if hasattr(program, "_suggest_failures"):
+                program._suggest_failures += dspy.settings.get("suggest_failures")

-                return example_idx, example, prediction, score
-            except Exception:
-                return example_idx, example, {}, 0.0
+            return prediction, score

         results = executor.execute(process_item, devset)
-        reordered_devset = [r for r in results if r is not None]
+        assert len(devset) == len(results)

-        ncorrect = sum(score for _, _, _, score in reordered_devset)
-        ntotal = len(reordered_devset)
-
-        if ntotal == 0:
-            logger.warning("No valid results to compute metrics.")
-            return 0.0
+        results = [((dspy.Prediction(), self.failure_score) if r is None else r) for r in results]
+        results = [(example, prediction, score) for example, (prediction, score) in zip(devset, results)]
+        ncorrect, ntotal = sum(score for *_, score in results), len(devset)

         logger.info(f"Average Metric: {ncorrect} / {ntotal} ({round(100 * ncorrect / ntotal, 1)}%)")
-
-        predicted_devset = sorted(reordered_devset)
-
-        if return_outputs:  # Handle the return_outputs logic
-            results = [(example, prediction, score) for _, example, prediction, score in predicted_devset]
-
+
         def prediction_is_dictlike(prediction):
             # Downstream logic for displaying dictionary-like predictions depends solely on the predictions
             # having a method called `items()` for iterating through key/value pairs
@@ -140,12 +129,12 @@ def prediction_is_dictlike(prediction):
                 if prediction_is_dictlike(prediction)
                 else dict(example) | {"prediction": prediction, "correct": score}
             )
-            for _, example, prediction, score in predicted_devset
+            for example, prediction, score in results
         ]

-        result_df = pd.DataFrame(data)

         # Truncate every cell in the DataFrame (DataFrame.applymap was renamed to DataFrame.map in Pandas 2.1.0)
+        result_df = pd.DataFrame(data)
         result_df = result_df.map(truncate_cell) if hasattr(result_df, "map") else result_df.applymap(truncate_cell)

         # Rename the 'correct' column to the name of the metric object
@@ -179,9 +168,9 @@ def prediction_is_dictlike(prediction):
             display(HTML(message))

         if return_all_scores and return_outputs:
-            return round(100 * ncorrect / ntotal, 2), results, [score for *_, score in predicted_devset]
+            return round(100 * ncorrect / ntotal, 2), results, [score for *_, score in results]
         if return_all_scores:
-            return round(100 * ncorrect / ntotal, 2), [score for *_, score in predicted_devset]
+            return round(100 * ncorrect / ntotal, 2), [score for *_, score in results]
         if return_outputs:
             return round(100 * ncorrect / ntotal, 2), results