Skip to content

Commit e004a5d

Browse files
committed
fix: GPU allocator test-case bug
1 parent fbce8c6 commit e004a5d

File tree

4 files changed

+62
-42
lines changed

4 files changed

+62
-42
lines changed

internal/controller/suite_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ var metricsRecorder *metrics.MetricsRecorder
6868

6969
func TestControllers(t *testing.T) {
7070
RegisterFailHandler(Fail)
71-
SetDefaultEventuallyTimeout(7 * time.Second)
71+
SetDefaultEventuallyTimeout(6 * time.Second)
7272
SetDefaultEventuallyPollingInterval(200 * time.Millisecond)
7373
SetDefaultConsistentlyDuration(5 * time.Second)
7474
SetDefaultConsistentlyPollingInterval(200 * time.Millisecond)

internal/gpuallocator/gpuallocator_suite_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -365,9 +365,9 @@ var _ = AfterSuite(func() {
365365
})
366366

367367
// Helper function to get a GPU from the API server
368-
func getGPU(name string, namespace string) *tfv1.GPU {
368+
func getGPU(name string) *tfv1.GPU {
369369
gpu := &tfv1.GPU{}
370-
key := types.NamespacedName{Name: name, Namespace: namespace}
370+
key := types.NamespacedName{Name: name}
371371
err := k8sClient.Get(ctx, key, gpu)
372372
ExpectWithOffset(1, err).NotTo(HaveOccurred())
373373
return gpu

internal/gpuallocator/gpuallocator_test.go

Lines changed: 30 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,20 @@ var workloadNameNs = tfv1.NameNamespace{Namespace: "default", Name: "test-worklo
3535
var _ = Describe("GPU Allocator", func() {
3636
var allocator *GpuAllocator
3737

38+
allocateAndSync := func(poolName string, request tfv1.Resource, count uint, gpuModel string) ([]*tfv1.GPU, error) {
39+
gpus, err := allocator.Alloc(ctx, poolName, workloadNameNs, request, count, gpuModel)
40+
allocator.syncToK8s(ctx)
41+
return gpus, err
42+
}
43+
44+
deallocateAndSync := func(gpus []*tfv1.GPU, request tfv1.Resource) {
45+
err := allocator.Dealloc(ctx, workloadNameNs, request, lo.Map(gpus, func(gpu *tfv1.GPU, _ int) types.NamespacedName {
46+
return client.ObjectKeyFromObject(gpu)
47+
}))
48+
Expect(err).NotTo(HaveOccurred())
49+
allocator.syncToK8s(ctx)
50+
}
51+
3852
BeforeEach(func() {
3953
allocator = NewGpuAllocator(ctx, k8sClient, 150*time.Millisecond)
4054
readyCh, err := allocator.SetupWithManager(ctx, mgr)
@@ -63,14 +77,10 @@ var _ = Describe("GPU Allocator", func() {
6377
Vram: resource.MustParse("8Gi"),
6478
}
6579

66-
gpus, err := allocator.Alloc(ctx, "test-pool", workloadNameNs, request, 1, "")
80+
gpus, err := allocateAndSync("test-pool", request, 1, "")
6781
Expect(err).NotTo(HaveOccurred())
6882
Expect(gpus).To(HaveLen(1))
6983

70-
// Explicitly call syncToK8s to persist changes before verification
71-
allocator.syncToK8s(ctx)
72-
73-
// Explicitly refresh node capacity, simulate reconcile loop of GPUNode triggered
7484
gpuNode := &tfv1.GPUNode{}
7585
if err := k8sClient.Get(ctx, types.NamespacedName{Name: gpus[0].Labels[constants.LabelKeyOwner]}, gpuNode); err != nil {
7686
Expect(err).NotTo(HaveOccurred())
@@ -82,7 +92,7 @@ var _ = Describe("GPU Allocator", func() {
8292
_, _ = RefreshGPUNodeCapacity(ctx, k8sClient, gpuNode, pool)
8393

8494
// Verify resources were reduced on the allocated GPU
85-
gpu := getGPU(gpus[0].Name, gpus[0].Namespace)
95+
gpu := getGPU(gpus[0].Name)
8696
Expect(gpu.Status.Available.Tflops.Cmp(gpu.Status.Capacity.Tflops)).To(Equal(-1))
8797
Expect(gpu.Status.Available.Vram.Cmp(gpu.Status.Capacity.Vram)).To(Equal(-1))
8898

@@ -99,7 +109,7 @@ var _ = Describe("GPU Allocator", func() {
99109
Vram: resource.MustParse("4Gi"),
100110
}
101111

102-
gpus, err := allocator.Alloc(ctx, "test-pool", workloadNameNs, request, 2, "")
112+
gpus, err := allocateAndSync("test-pool", request, 2, "")
103113
Expect(err).NotTo(HaveOccurred())
104114
Expect(gpus).To(HaveLen(2))
105115

@@ -116,7 +126,7 @@ var _ = Describe("GPU Allocator", func() {
116126
Vram: resource.MustParse("2Gi"),
117127
}
118128

119-
_, err := allocator.Alloc(ctx, "test-pool", workloadNameNs, request, 10, "")
129+
_, err := allocateAndSync("test-pool", request, 10, "")
120130
Expect(err).To(HaveOccurred())
121131
})
122132

@@ -126,7 +136,7 @@ var _ = Describe("GPU Allocator", func() {
126136
Vram: resource.MustParse("64Gi"),
127137
}
128138

129-
_, err := allocator.Alloc(ctx, "test-pool", workloadNameNs, request, 1, "")
139+
_, err := allocateAndSync("test-pool", request, 1, "")
130140
Expect(err).To(HaveOccurred())
131141
})
132142

@@ -136,7 +146,7 @@ var _ = Describe("GPU Allocator", func() {
136146
Vram: resource.MustParse("2Gi"),
137147
}
138148

139-
_, err := allocator.Alloc(ctx, "nonexistent-pool", workloadNameNs, request, 1, "")
149+
_, err := allocateAndSync("nonexistent-pool", request, 1, "")
140150
Expect(err).To(HaveOccurred())
141151
})
142152

@@ -147,13 +157,12 @@ var _ = Describe("GPU Allocator", func() {
147157
}
148158

149159
// Try allocating with a specific GPU model
150-
gpus, err := allocator.Alloc(ctx, "test-pool", workloadNameNs, request, 1, "NVIDIA A100")
160+
gpus, err := allocateAndSync("test-pool", request, 1, "NVIDIA A100")
151161
Expect(err).NotTo(HaveOccurred())
152-
Expect(gpus).To(HaveLen(1))
153162
Expect(gpus[0].Status.GPUModel).To(Equal("NVIDIA A100"))
154163

155164
// Try allocating with a non-existent GPU model
156-
_, err = allocator.Alloc(ctx, "test-pool", workloadNameNs, request, 1, "NonExistentModel")
165+
_, err = allocateAndSync("test-pool", request, 1, "NonExistentModel")
157166
Expect(err).To(HaveOccurred())
158167
})
159168
})
@@ -166,7 +175,7 @@ var _ = Describe("GPU Allocator", func() {
166175
Vram: resource.MustParse("6Gi"),
167176
}
168177

169-
gpus, err := allocator.Alloc(ctx, "test-pool", workloadNameNs, request, 1, "")
178+
gpus, err := allocateAndSync("test-pool", request, 1, "")
170179
Expect(err).NotTo(HaveOccurred())
171180
Expect(gpus).To(HaveLen(1))
172181

@@ -176,13 +185,10 @@ var _ = Describe("GPU Allocator", func() {
176185
allocatedVram := allocatedGPU.Status.Available.Vram.DeepCopy()
177186

178187
// Now deallocate
179-
err = allocator.Dealloc(ctx, workloadNameNs, request, []types.NamespacedName{client.ObjectKeyFromObject(gpus[0])})
180-
Expect(err).NotTo(HaveOccurred())
181-
182-
allocator.syncToK8s(ctx)
188+
deallocateAndSync(gpus, request)
183189

184190
// Verify resources were restored
185-
deallocatedGPU := getGPU(allocatedGPU.Name, allocatedGPU.Namespace)
191+
deallocatedGPU := getGPU(allocatedGPU.Name)
186192
expectedTflops := allocatedTflops.DeepCopy()
187193
expectedVram := allocatedVram.DeepCopy()
188194
expectedTflops.Add(request.Tflops)
@@ -201,7 +207,7 @@ var _ = Describe("GPU Allocator", func() {
201207
}
202208

203209
// Allocate 2 GPUs
204-
allocatedGPUs, err := allocator.Alloc(ctx, "test-pool", tfv1.NameNamespace{Namespace: "default", Name: "test-workload"}, request, 2, "")
210+
allocatedGPUs, err := allocateAndSync("test-pool", request, 2, "")
205211
Expect(err).NotTo(HaveOccurred())
206212
Expect(allocatedGPUs).To(HaveLen(2))
207213

@@ -230,23 +236,14 @@ var _ = Describe("GPU Allocator", func() {
230236
vram: gpu.Status.Available.Vram.DeepCopy(),
231237
}
232238
}
233-
gpusToDeallocKeys := lo.Map(gpusToDealloc, func(gpu *tfv1.GPU, _ int) types.NamespacedName {
234-
return client.ObjectKeyFromObject(gpu)
235-
})
239+
236240
// Now deallocate all GPUs including the non-existent one
237-
err = allocator.Dealloc(ctx, tfv1.NameNamespace{Namespace: "default", Name: "test-workload"}, request, gpusToDeallocKeys)
238-
Expect(err).NotTo(HaveOccurred())
241+
deallocateAndSync(gpusToDealloc, request)
239242

240243
// Verify resources were restored for existing GPUs
241244
for _, allocatedGPU := range allocatedGPUs {
242-
deallocatedGPU := getGPU(allocatedGPU.Name, allocatedGPU.Namespace)
245+
deallocatedGPU := getGPU(allocatedGPU.Name)
243246
initialState := initialStates[allocatedGPU.Name]
244-
245-
expectedTflops := initialState.tflops.DeepCopy()
246-
expectedVram := initialState.vram.DeepCopy()
247-
expectedTflops.Add(request.Tflops)
248-
expectedVram.Add(request.Vram)
249-
250247
Expect(deallocatedGPU.Status.Available.Tflops.Cmp(initialState.tflops)).To(Equal(1))
251248
Expect(deallocatedGPU.Status.Available.Vram.Cmp(initialState.vram)).To(Equal(1))
252249
}
@@ -301,7 +298,7 @@ var _ = Describe("GPU Allocator", func() {
301298
Expect(exists).To(BeTrue())
302299

303300
// Get the GPU from the API server
304-
gpuToDelete := getGPU("gpu-1", "")
301+
gpuToDelete := getGPU("gpu-1")
305302

306303
// Handle the deletion event
307304
allocator.handleGPUDelete(ctx, gpuToDelete)

internal/metrics/recorder.go

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -97,12 +97,35 @@ func SetNodeMetrics(node *tfv1.GPUNode, poolObj *tfv1.GPUPool, gpuModels []strin
9797
metricsItem.PoolName = poolObj.Name
9898
metricsItem.GPUModels = gpuModels
9999

100-
metricsItem.AllocatedTflops = node.Status.TotalTFlops.AsApproximateFloat64() - node.Status.AvailableTFlops.AsApproximateFloat64()
101-
metricsItem.AllocatedTflopsPercent = metricsItem.AllocatedTflops / node.Status.TotalTFlops.AsApproximateFloat64() * 100
102-
metricsItem.AllocatedVramBytes = node.Status.TotalVRAM.AsApproximateFloat64() - node.Status.AvailableVRAM.AsApproximateFloat64()
103-
metricsItem.AllocatedVramPercent = metricsItem.AllocatedVramBytes / node.Status.TotalVRAM.AsApproximateFloat64() * 100
104-
metricsItem.AllocatedTflopsPercentToVirtualCap = metricsItem.AllocatedTflops / node.Status.VirtualTFlops.AsApproximateFloat64() * 100
105-
metricsItem.AllocatedVramPercentToVirtualCap = metricsItem.AllocatedVramBytes / node.Status.VirtualVRAM.AsApproximateFloat64() * 100
100+
totalTflops := node.Status.TotalTFlops.AsApproximateFloat64()
101+
totalVram := node.Status.TotalVRAM.AsApproximateFloat64()
102+
103+
metricsItem.AllocatedTflops = totalTflops - node.Status.AvailableTFlops.AsApproximateFloat64()
104+
if totalTflops <= 0 {
105+
metricsItem.AllocatedTflopsPercent = 0
106+
} else {
107+
metricsItem.AllocatedTflopsPercent = metricsItem.AllocatedTflops / totalTflops * 100
108+
}
109+
110+
metricsItem.AllocatedVramBytes = totalVram - node.Status.AvailableVRAM.AsApproximateFloat64()
111+
if totalVram <= 0 {
112+
metricsItem.AllocatedVramPercent = 0
113+
} else {
114+
metricsItem.AllocatedVramPercent = metricsItem.AllocatedVramBytes / totalVram * 100
115+
}
116+
117+
totalVirtualTflops := node.Status.VirtualTFlops.AsApproximateFloat64()
118+
totalVirtualVram := node.Status.VirtualVRAM.AsApproximateFloat64()
119+
if totalVirtualTflops <= 0 {
120+
metricsItem.AllocatedTflopsPercentToVirtualCap = 0
121+
} else {
122+
metricsItem.AllocatedTflopsPercentToVirtualCap = metricsItem.AllocatedTflops / totalVirtualTflops * 100
123+
}
124+
if totalVirtualVram <= 0 {
125+
metricsItem.AllocatedVramPercentToVirtualCap = 0
126+
} else {
127+
metricsItem.AllocatedVramPercentToVirtualCap = metricsItem.AllocatedVramBytes / totalVirtualVram * 100
128+
}
106129
}
107130

108131
// Start metrics recorder

0 commit comments

Comments (0)