Skip to content

Commit 5cbfd14

Browse files
authored
fix: double dealloc gpu issue (#218)
1 parent 3f1e728 commit 5cbfd14

File tree

1 file changed

+12
-11
lines changed

1 file changed

+12
-11
lines changed

internal/controller/tensorfusionworkload_controller.go

Lines changed: 12 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -314,6 +314,18 @@ func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context
314314

315315
log.Info("Processing pod with GPU resource cleanup finalizer", "pod", pod.Name)
316316

317+
pod.Annotations[constants.GpuReleasedAnnotation] = shortuuid.New()
318+
319+
// Update the annotation of the Pod to mark that GPU cleanup has been successfully processed.
320+
// This is a key part of ensuring idempotency for the handlePodGPUCleanup function.
321+
// If this function is called again for the same Pod instance (e.g., due to the client cache
322+
// not yet reflecting the finalizer's removal), then this r.Update call will fail,
322+
// so the GPU resources will not be released twice.
324+
if err := r.Update(ctx, pod); err != nil {
325+
log.Error(err, "Failed to mark that GPU cleanup of pod")
326+
return false, err
327+
}
328+
317329
// read the GPU names from the pod annotations
318330
gpuNamesStr, ok := pod.Annotations[constants.GpuKey]
319331
if !ok {
@@ -335,17 +347,6 @@ func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context
335347
if pod.Annotations == nil {
336348
pod.Annotations = make(map[string]string)
337349
}
338-
pod.Annotations[constants.GpuReleasedAnnotation] = shortuuid.New()
339-
340-
// Update the annotation of the Pod to mark that GPU cleanup has been successfully processed.
341-
// This is a key part of ensuring idempotency for the handlePodGPUCleanup function.
342-
// If this function is called again for the same Pod instance (e.g., due to the client cache
343-
// not yet reflecting the finalizer's removal), then this r.Update call will fail,
343-
// so the GPU resources will not be released twice.
345-
if err := r.Update(ctx, pod); err != nil {
346-
log.Error(err, "Failed to mark that GPU cleanup of pod")
347-
return false, err
348-
}
349350

350351
return true, nil
351352
}

0 commit comments

Comments
 (0)