@@ -108,7 +108,9 @@ func (s *GpuAllocator) Alloc(
108
108
s .storeMutex .Lock ()
109
109
defer s .storeMutex .Unlock ()
110
110
111
+ appAdded := false
111
112
for _ , selectedGPU := range selectedGPUs {
113
+
112
114
// Get the GPU from the store
113
115
key := types.NamespacedName {Name : selectedGPU .Name , Namespace : selectedGPU .Namespace }
114
116
gpu , exists := s .gpuStore [key ]
@@ -122,7 +124,10 @@ func (s *GpuAllocator) Alloc(
122
124
gpu .Status .Available .Tflops .Sub (request .Tflops )
123
125
gpu .Status .Available .Vram .Sub (request .Vram )
124
126
125
- addRunningApp (gpu , workloadNameNamespace )
127
+ if ! appAdded {
128
+ addRunningApp (ctx , gpu , workloadNameNamespace )
129
+ appAdded = true
130
+ }
126
131
127
132
s .markGPUDirty (key )
128
133
}
@@ -143,6 +148,7 @@ func (s *GpuAllocator) Dealloc(ctx context.Context, workloadNameNamespace tfv1.N
143
148
s .storeMutex .Lock ()
144
149
defer s .storeMutex .Unlock ()
145
150
151
+ appRemoved := false
146
152
for _ , gpu := range gpus {
147
153
// Get the GPU from the store
148
154
storeGPU , exists := s .gpuStore [gpu ]
@@ -154,6 +160,10 @@ func (s *GpuAllocator) Dealloc(ctx context.Context, workloadNameNamespace tfv1.N
154
160
// Add resources back to the GPU
155
161
storeGPU .Status .Available .Tflops .Add (request .Tflops )
156
162
storeGPU .Status .Available .Vram .Add (request .Vram )
163
+ if ! appRemoved {
164
+ removeRunningApp (ctx , storeGPU , workloadNameNamespace )
165
+ appRemoved = true
166
+ }
157
167
158
168
s .markGPUDirty (gpu )
159
169
}
@@ -468,11 +478,14 @@ func (s *GpuAllocator) reconcileAllocationState(ctx context.Context) {
468
478
469
479
tflopsCapacityMap := make (map [types.NamespacedName ]resource.Quantity )
470
480
vramCapacityMap := make (map [types.NamespacedName ]resource.Quantity )
481
+ gpuMap := make (map [types.NamespacedName ]* tfv1.GPU )
471
482
472
483
for gpuKey , gpu := range s .gpuStore {
473
484
if gpu .Status .Capacity != nil {
474
485
tflopsCapacityMap [gpuKey ] = gpu .Status .Capacity .Tflops
475
486
vramCapacityMap [gpuKey ] = gpu .Status .Capacity .Vram
487
+ gpu .Status .RunningApps = []* tfv1.RunningAppDetail {}
488
+ gpuMap [gpuKey ] = gpu
476
489
}
477
490
}
478
491
@@ -481,6 +494,7 @@ func (s *GpuAllocator) reconcileAllocationState(ctx context.Context) {
481
494
vramRequest , _ := resource .ParseQuantity (worker .Annotations [constants .VRAMRequestAnnotation ])
482
495
gpuIds := worker .Annotations [constants .GpuKey ]
483
496
gpuIdsList := strings .Split (gpuIds , "," )
497
+ appAdded := false
484
498
for _ , gpuId := range gpuIdsList {
485
499
gpuKey := types.NamespacedName {Name : gpuId }
486
500
gpuCapacity , ok := tflopsCapacityMap [gpuKey ]
@@ -491,6 +505,10 @@ func (s *GpuAllocator) reconcileAllocationState(ctx context.Context) {
491
505
if ok {
492
506
gpuCapacity .Sub (vramRequest )
493
507
}
508
+ if ! appAdded {
509
+ addRunningApp (ctx , gpuMap [gpuKey ], tfv1.NameNamespace {Namespace : worker .Namespace , Name : worker .Labels [constants .WorkloadKey ]})
510
+ appAdded = true
511
+ }
494
512
}
495
513
}
496
514
@@ -510,7 +528,11 @@ func (s *GpuAllocator) reconcileAllocationState(ctx context.Context) {
510
528
}
511
529
}
512
530
513
- func addRunningApp (gpu * tfv1.GPU , workloadNameNamespace tfv1.NameNamespace ) {
531
+ func addRunningApp (ctx context.Context , gpu * tfv1.GPU , workloadNameNamespace tfv1.NameNamespace ) {
532
+ if gpu == nil {
533
+ log .FromContext (ctx ).Info ("[Warning] GPU is nil, skip adding running app" , "workload" , workloadNameNamespace .Name , "namespace" , workloadNameNamespace .Namespace )
534
+ return
535
+ }
514
536
if gpu .Status .RunningApps == nil {
515
537
gpu .Status .RunningApps = []* tfv1.RunningAppDetail {}
516
538
}
0 commit comments