@@ -54,6 +54,7 @@ def __init__(
5454 return_all_scores = False ,
5555 return_outputs = False ,
5656 provide_traceback = False ,
57+ failure_score = 0.0 ,
5758 ** _kwargs ,
5859 ):
5960 self .devset = devset
@@ -65,6 +66,7 @@ def __init__(
6566 self .return_all_scores = return_all_scores
6667 self .return_outputs = return_outputs
6768 self .provide_traceback = provide_traceback
69+ self .failure_score = failure_score
6870
6971 def __call__ (
7072 self ,
@@ -85,7 +87,6 @@ def __call__(
8587 return_all_scores = return_all_scores if return_all_scores is not None else self .return_all_scores
8688 return_outputs = return_outputs if return_outputs is not None else self .return_outputs
8789
88- devset = list (enumerate (devset ))
8990 tqdm .tqdm ._instances .clear ()
9091
9192 executor = ParallelExecutor (
@@ -96,39 +97,27 @@ def __call__(
9697 compare_results = True ,
9798 )
9899
99- def process_item (item ):
100- try :
101- example_idx , example = item
102- prediction = program (** example .inputs ())
103- score = metric (example , prediction )
100+ def process_item (example ):
101+ prediction = program (** example .inputs ())
102+ score = metric (example , prediction )
104103
105- # Increment assert and suggest failures to program's attributes
106- if hasattr (program , "_assert_failures" ):
107- program ._assert_failures += dspy .settings .get ("assert_failures" )
108- if hasattr (program , "_suggest_failures" ):
109- program ._suggest_failures += dspy .settings .get ("suggest_failures" )
104+ # Increment assert and suggest failures to program's attributes
105+ if hasattr (program , "_assert_failures" ):
106+ program ._assert_failures += dspy .settings .get ("assert_failures" )
107+ if hasattr (program , "_suggest_failures" ):
108+ program ._suggest_failures += dspy .settings .get ("suggest_failures" )
110109
111- return example_idx , example , prediction , score
112- except Exception :
113- return example_idx , example , {}, 0.0
110+ return prediction , score
114111
115112 results = executor .execute (process_item , devset )
116- reordered_devset = [ r for r in results if r is not None ]
113+ assert len ( devset ) == len ( results )
117114
118- ncorrect = sum (score for _ , _ , _ , score in reordered_devset )
119- ntotal = len (reordered_devset )
120-
121- if ntotal == 0 :
122- logger .warning ("No valid results to compute metrics." )
123- return 0.0
115+ results = [((dspy .Prediction (), self .failure_score ) if r is None else r ) for r in results ]
116+ results = [(example , prediction , score ) for example , (prediction , score ) in zip (devset , results )]
117+ ncorrect , ntotal = sum (score for * _ , score in results ), len (devset )
124118
125119 logger .info (f"Average Metric: { ncorrect } / { ntotal } ({ round (100 * ncorrect / ntotal , 1 )} %)" )
126-
127- predicted_devset = sorted (reordered_devset )
128-
129- if return_outputs : # Handle the return_outputs logic
130- results = [(example , prediction , score ) for _ , example , prediction , score in predicted_devset ]
131-
120+
132121 def prediction_is_dictlike (prediction ):
133122 # Downstream logic for displaying dictionary-like predictions depends solely on the predictions
134123 # having a method called `items()` for iterating through key/value pairs
@@ -140,12 +129,12 @@ def prediction_is_dictlike(prediction):
140129 if prediction_is_dictlike (prediction )
141130 else dict (example ) | {"prediction" : prediction , "correct" : score }
142131 )
143- for _ , example , prediction , score in predicted_devset
132+ for example , prediction , score in results
144133 ]
145134
146- result_df = pd .DataFrame (data )
147135
148136 # Truncate every cell in the DataFrame (DataFrame.applymap was renamed to DataFrame.map in Pandas 2.1.0)
137+ result_df = pd .DataFrame (data )
149138 result_df = result_df .map (truncate_cell ) if hasattr (result_df , "map" ) else result_df .applymap (truncate_cell )
150139
151140 # Rename the 'correct' column to the name of the metric object
@@ -179,9 +168,9 @@ def prediction_is_dictlike(prediction):
179168 display (HTML (message ))
180169
181170 if return_all_scores and return_outputs :
182- return round (100 * ncorrect / ntotal , 2 ), results , [score for * _ , score in predicted_devset ]
171+ return round (100 * ncorrect / ntotal , 2 ), results , [score for * _ , score in results ]
183172 if return_all_scores :
184- return round (100 * ncorrect / ntotal , 2 ), [score for * _ , score in predicted_devset ]
173+ return round (100 * ncorrect / ntotal , 2 ), [score for * _ , score in results ]
185174 if return_outputs :
186175 return round (100 * ncorrect / ntotal , 2 ), results
187176
0 commit comments