@@ -20,13 +20,11 @@ import (
20
20
"context"
21
21
"fmt"
22
22
"sort"
23
- "strings"
24
23
25
24
corev1 "k8s.io/api/core/v1"
26
25
"k8s.io/apimachinery/pkg/api/equality"
27
26
"k8s.io/apimachinery/pkg/api/errors"
28
27
"k8s.io/apimachinery/pkg/runtime"
29
- "k8s.io/apimachinery/pkg/types"
30
28
"k8s.io/client-go/tools/record"
31
29
ctrl "sigs.k8s.io/controller-runtime"
32
30
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -219,12 +217,12 @@ func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl
219
217
func (r * TensorFusionWorkloadReconciler ) tryStartWorker (
220
218
ctx context.Context ,
221
219
workerGenerator * worker.WorkerGenerator ,
222
- gpus [] * tfv1.GPU ,
220
+ gpu * tfv1.GPU ,
223
221
workload * tfv1.TensorFusionWorkload ,
224
222
hash string ,
225
223
) (* corev1.Pod , error ) {
226
224
port := workerGenerator .AllocPort ()
227
- pod , hash , err := workerGenerator .GenerateWorkerPod (gpus , fmt .Sprintf ("%s-tf-worker-" , workload .Name ), workload .Namespace , port , workload .Spec .Resources .Limits , hash )
225
+ pod , hash , err := workerGenerator .GenerateWorkerPod (gpu , fmt .Sprintf ("%s-tf-worker-" , workload .Name ), workload .Namespace , port , workload .Spec .Resources .Limits , hash )
228
226
if err != nil {
229
227
return nil , fmt .Errorf ("generate worker pod %w" , err )
230
228
}
@@ -233,18 +231,9 @@ func (r *TensorFusionWorkloadReconciler) tryStartWorker(
233
231
if pod .Labels == nil {
234
232
pod .Labels = make (map [string ]string )
235
233
}
236
-
237
- if pod .Annotations == nil {
238
- pod .Annotations = make (map [string ]string )
239
- }
240
-
241
- gpuNames := lo .Map (gpus , func (gpu * tfv1.GPU , _ int ) string {
242
- return gpu .Name
243
- })
244
-
245
234
pod .Labels [constants .WorkloadKey ] = workload .Name
235
+ pod .Labels [constants .GpuKey ] = gpu .Name
246
236
pod .Labels [constants .LabelKeyPodTemplateHash ] = hash
247
- pod .Annotations [constants .GpuKey ] = strings .Join (gpuNames , "," )
248
237
249
238
// Add finalizer for GPU resource cleanup
250
239
pod .Finalizers = append (pod .Finalizers , constants .Finalizer )
@@ -280,7 +269,6 @@ func (r *TensorFusionWorkloadReconciler) scaleDownWorkers(ctx context.Context, w
280
269
metrics .GpuTflopsLimit .Delete (labels )
281
270
metrics .VramBytesRequest .Delete (labels )
282
271
metrics .VramBytesLimit .Delete (labels )
283
- metrics .GpuCount .Delete (labels )
284
272
}
285
273
return nil
286
274
}
@@ -291,24 +279,26 @@ func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context
291
279
292
280
log .Info ("Processing pod with GPU resource cleanup finalizer" , "pod" , pod .Name )
293
281
294
- // read the GPU names from the pod annotations
295
- gpuNamesStr , ok := pod .Annotations [constants .GpuKey ]
282
+ // Get GPU name from pod label
283
+ gpuName , ok := pod .Labels [constants .GpuKey ]
296
284
if ! ok {
297
285
log .Info ("Pod has finalizer but no GPU label" , "pod" , pod .Name )
298
286
return true , nil
299
287
}
300
288
301
- // Split GPU names by comma
302
- gpuNames := strings .Split (gpuNamesStr , "," )
303
- gpus := lo .Map (gpuNames , func (gpuName string , _ int ) types.NamespacedName {
304
- return types.NamespacedName {Name : gpuName }
305
- })
306
- // Release GPU resources
307
- if err := r .Allocator .Dealloc (ctx , workload .Spec .Resources .Requests , gpus ); err != nil {
308
- log .Error (err , "Failed to release GPU resources, will retry" , "gpus" , gpus , "pod" , pod .Name )
289
+ // Get the GPU
290
+ gpu := & tfv1.GPU {}
291
+ if err := r .Get (ctx , client.ObjectKey {Name : gpuName }, gpu ); err != nil {
292
+ if errors .IsNotFound (err ) {
293
+ // GPU not found, just continue
294
+ log .Info ("GPU not found" , "gpu" , gpuName , "pod" , pod .Name )
295
+ return true , nil
296
+ }
297
+ // Error getting GPU, retry later
298
+ log .Error (err , "Failed to get GPU" , "gpu" , gpuName , "pod" , pod .Name )
309
299
return false , err
310
300
}
311
- log . Info ( "Released GPU resources via finalizer" , "gpus" , gpus , "pod" , pod . Name )
301
+
312
302
if pod .Annotations == nil {
313
303
pod .Annotations = make (map [string ]string )
314
304
}
@@ -320,10 +310,17 @@ func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context
320
310
// not yet reflecting the finalizer's removal), then this r.Update of the pod will fail.
321
311
// Will not cause duplicate releases
322
312
if err := r .Update (ctx , pod ); err != nil {
323
- log .Error (err , "Failed to mark that GPU cleanup of pod" )
313
+ log .Error (err , "Failed to mark that GPU cleanup of pod" , "gpu" , gpuName , "pod" , pod .Name )
314
+ return false , err
315
+ }
316
+
317
+ // Release GPU resources
318
+ if err := r .Allocator .Dealloc (ctx , workload .Spec .Resources .Requests , gpu ); err != nil {
319
+ log .Error (err , "Failed to release GPU resources, will retry" , "gpu" , gpuName , "pod" , pod .Name )
324
320
return false , err
325
321
}
326
322
323
+ log .Info ("Released GPU resources via finalizer" , "gpu" , gpuName , "pod" , pod .Name )
327
324
return true , nil
328
325
}
329
326
@@ -347,21 +344,21 @@ func (r *TensorFusionWorkloadReconciler) scaleUpWorkers(ctx context.Context, wor
347
344
// Create worker pods
348
345
for range count {
349
346
// Schedule GPU for the worker
350
- gpus , err := r .Allocator .Alloc (ctx , workload .Spec .PoolName , workload .Spec .Resources .Requests , workload . Spec . GPUCount )
347
+ gpus , err := r .Allocator .Alloc (ctx , workload .Spec .PoolName , workload .Spec .Resources .Requests , 1 )
351
348
if err != nil {
352
349
r .Recorder .Eventf (workload , corev1 .EventTypeWarning , "ScheduleGPUFailed" , "Failed to schedule GPU: %v" , err )
353
350
return ctrl.Result {RequeueAfter : constants .PendingRequeueDuration }, nil
354
351
}
355
352
356
- pod , err := r .tryStartWorker (ctx , workerGenerator , gpus , workload , hash )
353
+ // Alloc was asked for exactly one GPU, so take the first entry of the returned slice
354
+ gpu := gpus [0 ]
355
+
356
+ pod , err := r .tryStartWorker (ctx , workerGenerator , gpu , workload , hash )
357
357
if err != nil {
358
- // Try to release all allocated GPUs if pod creation fails
359
- gpus := lo .Map (gpus , func (gpu * tfv1.GPU , _ int ) types.NamespacedName {
360
- return client .ObjectKeyFromObject (gpu )
361
- })
362
- releaseErr := r .Allocator .Dealloc (ctx , workload .Spec .Resources .Requests , gpus )
358
+ // Try to release the GPU resource if pod creation fails
359
+ releaseErr := r .Allocator .Dealloc (ctx , workload .Spec .Resources .Requests , gpu )
363
360
if releaseErr != nil {
364
- log .Error (releaseErr , "Failed to release GPU after pod creation failure" , "gpus" , gpus )
361
+ log .Error (releaseErr , "Failed to release GPU after pod creation failure" )
365
362
}
366
363
return ctrl.Result {}, fmt .Errorf ("create worker pod: %w" , err )
367
364
}
@@ -375,7 +372,6 @@ func (r *TensorFusionWorkloadReconciler) scaleUpWorkers(ctx context.Context, wor
375
372
metrics .GpuTflopsLimit .With (labels ).Set (workload .Spec .Resources .Limits .Tflops .AsApproximateFloat64 ())
376
373
metrics .VramBytesRequest .With (labels ).Set (workload .Spec .Resources .Requests .Vram .AsApproximateFloat64 ())
377
374
metrics .VramBytesLimit .With (labels ).Set (workload .Spec .Resources .Limits .Vram .AsApproximateFloat64 ())
378
- metrics .GpuCount .With (labels ).Set (float64 (workload .Spec .GPUCount ))
379
375
}
380
376
381
377
return ctrl.Result {}, nil
0 commit comments