fix: unit test issues, workload count bug

Code2Life · Code2Life · commit 00a57cc803a0 · 2025-06-03T22:39:56.000+08:00
diff --git a/Makefile b/Makefile
@@ -62,7 +62,7 @@ vet: ## Run go vet against code.
 
 .PHONY: test
 test: manifests generate fmt vet envtest ## Run tests.
-	KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" GO_TESTING=true ginkgo --p -timeout 0 -cover -coverprofile cover.out -r --skip-file ./test/e2e
+	KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" GO_TESTING=true go run github.com/onsi/ginkgo/v2/ginkgo -p -timeout 0 -cover -coverprofile cover.out -r --skip-file ./test/e2e
 
 .PHONY: test-e2e
 test-e2e: manifests generate fmt vet ## Run the e2e tests. Expected an isolated environment using Kind.
diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go
@@ -35,6 +35,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/kubernetes/scheme"
 	"k8s.io/client-go/rest"
+	"k8s.io/client-go/util/retry"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
 	"sigs.k8s.io/controller-runtime/pkg/envtest"
@@ -253,7 +254,20 @@ func (c *TensorFusionEnv) GetCluster() *tfv1.TensorFusionCluster {
 
+// UpdateCluster writes tfc.Spec onto the stored TensorFusionCluster, re-reading
+// the object and retrying whenever the API server reports an update conflict.
 func (c *TensorFusionEnv) UpdateCluster(tfc *tfv1.TensorFusionCluster) {
 	GinkgoHelper()
-	Expect(k8sClient.Update(ctx, tfc)).Should(Succeed())
+	var latest *tfv1.TensorFusionCluster
+	err := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
+		// Start from a fresh object on every attempt so nothing from an earlier read lingers.
+		latest = &tfv1.TensorFusionCluster{}
+		if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(tfc), latest); err != nil {
+			return err
+		}
+		latest.Spec = tfc.Spec
+		return k8sClient.Update(ctx, latest)
+	})
+	Expect(err).NotTo(HaveOccurred())
+	// Mirror the stored object back into tfc, as the direct Update(ctx, tfc) call did before.
+	*tfc = *latest
 }
 
 func (c *TensorFusionEnv) Cleanup() {
diff --git a/internal/controller/tensorfusioncluster_controller.go b/internal/controller/tensorfusioncluster_controller.go
@@ -432,16 +432,16 @@ func (r *TensorFusionClusterReconciler) updateMetricsRecorder(ctx context.Contex
 	}
 	pricingDetail := r.MetricsRecorder.WorkerUnitPriceMap[pool.Name]
 	for _, pricing := range qosConfig.Pricing {
-		tflopsPerSecond, _ := strconv.ParseFloat(pricing.Requests.PerFP16TFlopsPerHour, 64)
-		vramPerSecond, _ := strconv.ParseFloat(pricing.Requests.PerGBOfVRAMPerHour, 64)
+		tflopsPerHour, _ := strconv.ParseFloat(pricing.Requests.PerFP16TFlopsPerHour, 64)
+		vramPerHour, _ := strconv.ParseFloat(pricing.Requests.PerGBOfVRAMPerHour, 64)
 		limitOverRequestChargingRatio, _ := strconv.ParseFloat(pricing.LimitsOverRequestsChargingRatio, 64)
 
 		pricingDetail[string(pricing.Qos)] = metrics.RawBillingPricing{
-			TflopsPerSecond: tflopsPerSecond / 3600,
-			VramPerSecond:   vramPerSecond / 3600,
+			TflopsPerSecond: tflopsPerHour / float64(3600),
+			VramPerSecond:   vramPerHour / float64(3600),
 
-			TflopsOverRequestPerSecond: tflopsPerSecond / 3600 * limitOverRequestChargingRatio,
-			VramOverRequestPerSecond:   vramPerSecond / 3600 * limitOverRequestChargingRatio,
+			TflopsOverRequestPerSecond: tflopsPerHour / float64(3600) * limitOverRequestChargingRatio,
+			VramOverRequestPerSecond:   vramPerHour / float64(3600) * limitOverRequestChargingRatio,
 		}
 	}
 
diff --git a/internal/controller/tensorfusionworkload_controller.go b/internal/controller/tensorfusionworkload_controller.go
@@ -269,6 +269,9 @@ func (r *TensorFusionWorkloadReconciler) tryStartWorker(
 	workload *tfv1.TensorFusionWorkload,
 	hash string,
 ) (*corev1.Pod, error) {
+	if len(gpus) == 0 || gpus[0].Labels == nil {
+		return nil, fmt.Errorf("no gpus or no labels, can not assign host port for worker")
+	}
 	port, err := r.PortAllocator.AssignHostPort(gpus[0].Status.NodeSelector[constants.KubernetesHostNameLabel])
 	if err != nil {
 		return nil, fmt.Errorf("get host port %w", err)
diff --git a/internal/gpuallocator/gpuallocator.go b/internal/gpuallocator/gpuallocator.go
@@ -108,7 +108,9 @@ func (s *GpuAllocator) Alloc(
 	s.storeMutex.Lock()
 	defer s.storeMutex.Unlock()
 
+	appAdded := false
 	for _, selectedGPU := range selectedGPUs {
+
 		// Get the GPU from the store
 		key := types.NamespacedName{Name: selectedGPU.Name, Namespace: selectedGPU.Namespace}
 		gpu, exists := s.gpuStore[key]
@@ -122,7 +124,10 @@ func (s *GpuAllocator) Alloc(
 		gpu.Status.Available.Tflops.Sub(request.Tflops)
 		gpu.Status.Available.Vram.Sub(request.Vram)
 
-		addRunningApp(gpu, workloadNameNamespace)
+		if !appAdded {
+			addRunningApp(ctx, gpu, workloadNameNamespace)
+			appAdded = true
+		}
 
 		s.markGPUDirty(key)
 	}
@@ -143,6 +148,7 @@ func (s *GpuAllocator) Dealloc(ctx context.Context, workloadNameNamespace tfv1.N
 	s.storeMutex.Lock()
 	defer s.storeMutex.Unlock()
 
+	appRemoved := false
 	for _, gpu := range gpus {
 		// Get the GPU from the store
 		storeGPU, exists := s.gpuStore[gpu]
@@ -154,6 +160,10 @@ func (s *GpuAllocator) Dealloc(ctx context.Context, workloadNameNamespace tfv1.N
 		// Add resources back to the GPU
 		storeGPU.Status.Available.Tflops.Add(request.Tflops)
 		storeGPU.Status.Available.Vram.Add(request.Vram)
+		if !appRemoved {
+			removeRunningApp(ctx, storeGPU, workloadNameNamespace)
+			appRemoved = true
+		}
 
 		s.markGPUDirty(gpu)
 	}
@@ -468,11 +478,14 @@ func (s *GpuAllocator) reconcileAllocationState(ctx context.Context) {
 
 	tflopsCapacityMap := make(map[types.NamespacedName]resource.Quantity)
 	vramCapacityMap := make(map[types.NamespacedName]resource.Quantity)
+	gpuMap := make(map[types.NamespacedName]*tfv1.GPU)
 
 	for gpuKey, gpu := range s.gpuStore {
 		if gpu.Status.Capacity != nil {
 			tflopsCapacityMap[gpuKey] = gpu.Status.Capacity.Tflops
 			vramCapacityMap[gpuKey] = gpu.Status.Capacity.Vram
+			gpu.Status.RunningApps = []*tfv1.RunningAppDetail{}
+			gpuMap[gpuKey] = gpu
 		}
 	}
 
@@ -481,6 +494,7 @@ func (s *GpuAllocator) reconcileAllocationState(ctx context.Context) {
 		vramRequest, _ := resource.ParseQuantity(worker.Annotations[constants.VRAMRequestAnnotation])
 		gpuIds := worker.Annotations[constants.GpuKey]
 		gpuIdsList := strings.Split(gpuIds, ",")
+		appAdded := false
 		for _, gpuId := range gpuIdsList {
 			gpuKey := types.NamespacedName{Name: gpuId}
 			gpuCapacity, ok := tflopsCapacityMap[gpuKey]
@@ -491,6 +505,10 @@ func (s *GpuAllocator) reconcileAllocationState(ctx context.Context) {
 			if ok {
 				gpuCapacity.Sub(vramRequest)
 			}
+			if !appAdded {
+				addRunningApp(ctx, gpuMap[gpuKey], tfv1.NameNamespace{Namespace: worker.Namespace, Name: worker.Labels[constants.WorkloadKey]})
+				appAdded = true
+			}
 		}
 	}
 
@@ -510,7 +528,11 @@ func (s *GpuAllocator) reconcileAllocationState(ctx context.Context) {
 	}
 }
 
-func addRunningApp(gpu *tfv1.GPU, workloadNameNamespace tfv1.NameNamespace) {
+func addRunningApp(ctx context.Context, gpu *tfv1.GPU, workloadNameNamespace tfv1.NameNamespace) {
+	if gpu == nil {
+		log.FromContext(ctx).Info("[Warning] GPU is nil, skip adding running app", "workload", workloadNameNamespace.Name, "namespace", workloadNameNamespace.Namespace)
+		return
+	}
 	if gpu.Status.RunningApps == nil {
 		gpu.Status.RunningApps = []*tfv1.RunningAppDetail{}
 	}