@@ -314,6 +314,18 @@ func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context
 
 	log.Info("Processing pod with GPU resource cleanup finalizer", "pod", pod.Name)
 
+	pod.Annotations[constants.GpuReleasedAnnotation] = shortuuid.New()
+
+	// Update the Pod to persist the annotation marking that GPU cleanup has been processed.
+	// This is a key part of keeping handlePodGPUCleanup idempotent: if this function is
+	// called again for the same (stale) Pod instance, e.g. because the client cache does
+	// not yet reflect the finalizer's removal, this r.Update of the pod fails, so the
+	// GPUs are never released twice.
+	if err := r.Update(ctx, pod); err != nil {
+		log.Error(err, "Failed to mark GPU cleanup annotation on pod")
+		return false, err
+	}
+
 	// read the GPU names from the pod annotations
 	gpuNamesStr, ok := pod.Annotations[constants.GpuKey]
 	if !ok {
@@ -335,17 +347,6 @@ func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context
 	if pod.Annotations == nil {
 		pod.Annotations = make(map[string]string)
 	}
-	pod.Annotations[constants.GpuReleasedAnnotation] = shortuuid.New()
-
-	// Update the annotation of the Pod to mark that GPU cleanup has been successfully processed.
-	// This is a key part of ensuring idempotency for the handlePodGPUCleanup function.
-	// If this function is called again for the same Pod instance (e.g., due to the client cache
-	// not yet reflecting the finalizer's removal), Then this r.Update pod will fail.
-	// Will not cause duplicate releases
-	if err := r.Update(ctx, pod); err != nil {
-		log.Error(err, "Failed to mark that GPU cleanup of pod")
-		return false, err
-	}
 
 	return true, nil
 }
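For context, the ordering this change establishes can be summarized in a standalone sketch (not the project's actual code): persist the "released" marker with an Update before performing the irreversible GPU release, so a reconcile that runs against a stale cached Pod fails the Update on a resourceVersion conflict and never reaches the release. The helper name markThenRelease, the releaseGPUs callback, the annotation key, and the shortuuid import path below are assumptions for illustration only.

```go
package cleanup

import (
	"context"

	"github.com/lithammer/shortuuid/v4"
	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// Assumed annotation key, for illustration only.
const gpuReleasedAnnotation = "tensor-fusion.ai/gpu-released"

// markThenRelease writes the idempotency marker and persists it first; only if
// that Update succeeds does it run the actual release. When the Pod object came
// from a stale cache, the Update fails with a resourceVersion conflict, the
// function returns early, and the release is never executed twice.
func markThenRelease(ctx context.Context, c client.Client, pod *corev1.Pod, releaseGPUs func() error) error {
	if pod.Annotations == nil {
		pod.Annotations = map[string]string{}
	}
	pod.Annotations[gpuReleasedAnnotation] = shortuuid.New()

	if err := c.Update(ctx, pod); err != nil {
		// Conflicts are the expected failure on stale objects; returning here
		// lets the controller requeue and re-read the Pod before retrying.
		return err
	}

	// The marker is persisted against the current resourceVersion, so a
	// re-entry with an older copy of the Pod cannot reach this point.
	return releaseGPUs()
}
```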