Revert "feat: add GPUCount field to TensorFusionWorkload and WorkloadProfile …" (#197)

0x5457 · web-flow · commit 80e65c802211 · 2025-05-26T19:57:22.000+08:00
This reverts commit f58be3c.
diff --git a/api/v1/workloadprofile_types.go b/api/v1/workloadprofile_types.go
@@ -39,8 +39,8 @@ type WorkloadProfileSpec struct {
 	PoolName string `json:"poolName,omitempty"`
 
 	// +optional
-
 	Resources Resources `json:"resources,omitempty"`
+
 	// +optional
 	// Qos defines the quality of service level for the client.
 	Qos QoSLevel `json:"qos,omitempty"`
@@ -50,8 +50,9 @@ type WorkloadProfileSpec struct {
 	IsLocalGPU bool `json:"isLocalGPU,omitempty"`
 
 	// +optional
+	// TODO, not implemented
 	// The number of GPUs to be used by the workload, default to 1
-	GPUCount uint `json:"gpuCount,omitempty"`
+	GPUCount int `json:"gpuCount,omitempty"`
 
 	// +optional
 	// TODO, not implemented
diff --git a/internal/controller/tensorfusionworkload_controller.go b/internal/controller/tensorfusionworkload_controller.go
@@ -20,13 +20,11 @@ import (
 	"context"
 	"fmt"
 	"sort"
-	"strings"
 
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/equality"
 	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/runtime"
-	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/tools/record"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -219,12 +217,12 @@ func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl
 func (r *TensorFusionWorkloadReconciler) tryStartWorker(
 	ctx context.Context,
 	workerGenerator *worker.WorkerGenerator,
-	gpus []*tfv1.GPU,
+	gpu *tfv1.GPU,
 	workload *tfv1.TensorFusionWorkload,
 	hash string,
 ) (*corev1.Pod, error) {
 	port := workerGenerator.AllocPort()
-	pod, hash, err := workerGenerator.GenerateWorkerPod(gpus, fmt.Sprintf("%s-tf-worker-", workload.Name), workload.Namespace, port, workload.Spec.Resources.Limits, hash)
+	pod, hash, err := workerGenerator.GenerateWorkerPod(gpu, fmt.Sprintf("%s-tf-worker-", workload.Name), workload.Namespace, port, workload.Spec.Resources.Limits, hash)
 	if err != nil {
 		return nil, fmt.Errorf("generate worker pod %w", err)
 	}
@@ -233,18 +231,9 @@ func (r *TensorFusionWorkloadReconciler) tryStartWorker(
 	if pod.Labels == nil {
 		pod.Labels = make(map[string]string)
 	}
-
-	if pod.Annotations == nil {
-		pod.Annotations = make(map[string]string)
-	}
-
-	gpuNames := lo.Map(gpus, func(gpu *tfv1.GPU, _ int) string {
-		return gpu.Name
-	})
-
 	pod.Labels[constants.WorkloadKey] = workload.Name
+	pod.Labels[constants.GpuKey] = gpu.Name
 	pod.Labels[constants.LabelKeyPodTemplateHash] = hash
-	pod.Annotations[constants.GpuKey] = strings.Join(gpuNames, ",")
 
 	// Add finalizer for GPU resource cleanup
 	pod.Finalizers = append(pod.Finalizers, constants.Finalizer)
@@ -280,7 +269,6 @@ func (r *TensorFusionWorkloadReconciler) scaleDownWorkers(ctx context.Context, w
 		metrics.GpuTflopsLimit.Delete(labels)
 		metrics.VramBytesRequest.Delete(labels)
 		metrics.VramBytesLimit.Delete(labels)
-		metrics.GpuCount.Delete(labels)
 	}
 	return nil
 }
@@ -291,24 +279,26 @@ func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context
 
 	log.Info("Processing pod with GPU resource cleanup finalizer", "pod", pod.Name)
 
-	// read the GPU names from the pod annotations
-	gpuNamesStr, ok := pod.Annotations[constants.GpuKey]
+	// Get GPU name from pod label
+	gpuName, ok := pod.Labels[constants.GpuKey]
 	if !ok {
 		log.Info("Pod has finalizer but no GPU label", "pod", pod.Name)
 		return true, nil
 	}
 
-	// Split GPU names by comma
-	gpuNames := strings.Split(gpuNamesStr, ",")
-	gpus := lo.Map(gpuNames, func(gpuName string, _ int) types.NamespacedName {
-		return types.NamespacedName{Name: gpuName}
-	})
-	// Release GPU resources
-	if err := r.Allocator.Dealloc(ctx, workload.Spec.Resources.Requests, gpus); err != nil {
-		log.Error(err, "Failed to release GPU resources, will retry", "gpus", gpus, "pod", pod.Name)
+	// Get the GPU
+	gpu := &tfv1.GPU{}
+	if err := r.Get(ctx, client.ObjectKey{Name: gpuName}, gpu); err != nil {
+		if errors.IsNotFound(err) {
+			// GPU not found, just continue
+			log.Info("GPU not found", "gpu", gpuName, "pod", pod.Name)
+			return true, nil
+		}
+		// Error getting GPU, retry later
+		log.Error(err, "Failed to get GPU", "gpu", gpuName, "pod", pod.Name)
 		return false, err
 	}
-	log.Info("Released GPU resources via finalizer", "gpus", gpus, "pod", pod.Name)
+
 	if pod.Annotations == nil {
 		pod.Annotations = make(map[string]string)
 	}
@@ -320,10 +310,17 @@ func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context
 	// not yet reflecting the finalizer's removal), Then this r.Update pod will fail.
 	// Will not cause duplicate releases
 	if err := r.Update(ctx, pod); err != nil {
-		log.Error(err, "Failed to mark that GPU cleanup of pod")
+		log.Error(err, "Failed to mark that GPU cleanup of pod", "gpu", gpuName, "pod", pod.Name)
+		return false, err
+	}
+
+	// Release GPU resources
+	if err := r.Allocator.Dealloc(ctx, workload.Spec.Resources.Requests, gpu); err != nil {
+		log.Error(err, "Failed to release GPU resources, will retry", "gpu", gpuName, "pod", pod.Name)
 		return false, err
 	}
 
+	log.Info("Released GPU resources via finalizer", "gpu", gpuName, "pod", pod.Name)
 	return true, nil
 }
 
@@ -347,21 +344,21 @@ func (r *TensorFusionWorkloadReconciler) scaleUpWorkers(ctx context.Context, wor
 	// Create worker pods
 	for range count {
 		// Schedule GPU for the worker
-		gpus, err := r.Allocator.Alloc(ctx, workload.Spec.PoolName, workload.Spec.Resources.Requests, workload.Spec.GPUCount)
+		gpus, err := r.Allocator.Alloc(ctx, workload.Spec.PoolName, workload.Spec.Resources.Requests, 1)
 		if err != nil {
 			r.Recorder.Eventf(workload, corev1.EventTypeWarning, "ScheduleGPUFailed", "Failed to schedule GPU: %v", err)
 			return ctrl.Result{RequeueAfter: constants.PendingRequeueDuration}, nil
 		}
 
-		pod, err := r.tryStartWorker(ctx, workerGenerator, gpus, workload, hash)
+		// Use the first GPU from the allocated array
+		gpu := gpus[0]
+
+		pod, err := r.tryStartWorker(ctx, workerGenerator, gpu, workload, hash)
 		if err != nil {
-			// Try to release all allocated GPUs if pod creation fails
-			gpus := lo.Map(gpus, func(gpu *tfv1.GPU, _ int) types.NamespacedName {
-				return client.ObjectKeyFromObject(gpu)
-			})
-			releaseErr := r.Allocator.Dealloc(ctx, workload.Spec.Resources.Requests, gpus)
+			// Try to release the GPU resource if pod creation fails
+			releaseErr := r.Allocator.Dealloc(ctx, workload.Spec.Resources.Requests, gpu)
 			if releaseErr != nil {
-				log.Error(releaseErr, "Failed to release GPU after pod creation failure", "gpus", gpus)
+				log.Error(releaseErr, "Failed to release GPU after pod creation failure")
 			}
 			return ctrl.Result{}, fmt.Errorf("create worker pod: %w", err)
 		}
@@ -375,7 +372,6 @@ func (r *TensorFusionWorkloadReconciler) scaleUpWorkers(ctx context.Context, wor
 		metrics.GpuTflopsLimit.With(labels).Set(workload.Spec.Resources.Limits.Tflops.AsApproximateFloat64())
 		metrics.VramBytesRequest.With(labels).Set(workload.Spec.Resources.Requests.Vram.AsApproximateFloat64())
 		metrics.VramBytesLimit.With(labels).Set(workload.Spec.Resources.Limits.Vram.AsApproximateFloat64())
-		metrics.GpuCount.With(labels).Set(float64(workload.Spec.GPUCount))
 	}
 
 	return ctrl.Result{}, nil
diff --git a/internal/controller/tensorfusionworkload_controller_test.go b/internal/controller/tensorfusionworkload_controller_test.go
@@ -17,7 +17,6 @@ limitations under the License.
 package controller
 
 import (
-	"strings"
 	"time"
 
 	"github.com/aws/smithy-go/ptr"
@@ -58,51 +57,6 @@ var _ = Describe("TensorFusionWorkload Controller", func() {
 			checkWorkerPodCount(workload)
 			checkWorkloadStatus(workload)
 		})
-
-		It("Should allocate multiple GPUs per workload when GPUCount > 1", func() {
-			pool := tfEnv.GetGPUPool(0)
-			By("creating a workload that requests 2 GPUs")
-			workload := &tfv1.TensorFusionWorkload{
-				ObjectMeta: metav1.ObjectMeta{
-					Name:      key.Name,
-					Namespace: key.Namespace,
-					Labels: map[string]string{
-						constants.LabelKeyOwner: pool.Name,
-					},
-				},
-				Spec: tfv1.WorkloadProfileSpec{
-					Replicas: ptr.Int32(1),
-					PoolName: pool.Name,
-					GPUCount: 2,
-					Resources: tfv1.Resources{
-						Requests: tfv1.Resource{
-							Tflops: resource.MustParse("10"),
-							Vram:   resource.MustParse("8Gi"),
-						},
-						Limits: tfv1.Resource{
-							Tflops: resource.MustParse("20"),
-							Vram:   resource.MustParse("16Gi"),
-						},
-					},
-				},
-			}
-
-			Expect(k8sClient.Create(ctx, workload)).To(Succeed())
-
-			// Check that pod is created with 2 GPUs
-			podList := &corev1.PodList{}
-			Eventually(func(g Gomega) {
-				g.Expect(k8sClient.List(ctx, podList,
-					client.InNamespace(key.Namespace),
-					client.MatchingLabels{constants.WorkloadKey: key.Name})).Should(Succeed())
-				g.Expect(podList.Items).Should(HaveLen(1))
-
-				gpuNames := strings.Split(podList.Items[0].Annotations[constants.GpuKey], ",")
-				g.Expect(gpuNames).Should(HaveLen(2))
-			}, timeout, interval).Should(Succeed())
-
-			checkWorkloadStatus(workload)
-		})
 	})
 
 	Context("When scaling up a workload", func() {
diff --git a/internal/gpuallocator/gpuallocator.go b/internal/gpuallocator/gpuallocator.go
@@ -121,26 +121,25 @@ func (s *GpuAllocator) Alloc(
 	return result, nil
 }
 
-// Dealloc deallocates a request from one or multiple gpus.
-func (s *GpuAllocator) Dealloc(ctx context.Context, request tfv1.Resource, gpus []types.NamespacedName) error {
+// Dealloc deallocates a request from a gpu.
+func (s *GpuAllocator) Dealloc(ctx context.Context, request tfv1.Resource, gpu *tfv1.GPU) error {
 	log := log.FromContext(ctx)
 	s.storeMutex.Lock()
 	defer s.storeMutex.Unlock()
 
-	for _, gpu := range gpus {
-		// Get the GPU from the store
-		storeGPU, exists := s.gpuStore[gpu]
-		if !exists {
-			log.Error(fmt.Errorf("GPU not found in store"), "Failed to deallocate GPU", "name", gpu.String())
-			continue
-		}
+	// Get the GPU from the store
+	key := types.NamespacedName{Name: gpu.Name, Namespace: gpu.Namespace}
+	storeGPU, exists := s.gpuStore[key]
+	if !exists {
+		log.Info("GPU not found in store during deallocation", "name", key.String())
+		return fmt.Errorf("GPU %s not found in store", key.String())
+	}
 
-		// Add resources back to the GPU
-		storeGPU.Status.Available.Tflops.Add(request.Tflops)
-		storeGPU.Status.Available.Vram.Add(request.Vram)
+	// Add resources back to the GPU
+	storeGPU.Status.Available.Tflops.Add(request.Tflops)
+	storeGPU.Status.Available.Vram.Add(request.Vram)
 
-		s.markGPUDirty(gpu)
-	}
+	s.markGPUDirty(key)
 
 	return nil
 }
diff --git a/internal/gpuallocator/gpuallocator_test.go b/internal/gpuallocator/gpuallocator_test.go
@@ -23,11 +23,9 @@ import (
 	"github.com/NexusGPU/tensor-fusion/internal/constants"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
-	"github.com/samber/lo"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
-	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
 var _ = Describe("GPU Allocator", func() {
@@ -140,7 +138,7 @@ var _ = Describe("GPU Allocator", func() {
 			allocatedVram := allocatedGPU.Status.Available.Vram.DeepCopy()
 
 			// Now deallocate
-			err = allocator.Dealloc(ctx, request, []types.NamespacedName{client.ObjectKeyFromObject(gpus[0])})
+			err = allocator.Dealloc(ctx, request, allocatedGPU)
 			Expect(err).NotTo(HaveOccurred())
 
 			// Verify resources were restored
@@ -150,69 +148,9 @@ var _ = Describe("GPU Allocator", func() {
 			expectedTflops.Add(request.Tflops)
 			expectedVram.Add(request.Vram)
 
-			Expect(deallocatedGPU.Status.Available.Tflops.Cmp(expectedTflops)).To(Equal(0))
-			Expect(deallocatedGPU.Status.Available.Vram.Cmp(expectedVram)).To(Equal(0))
+			Expect(deallocatedGPU.Status.Available.Tflops.Cmp(allocatedTflops)).To(Equal(1))
 			Expect(deallocatedGPU.Status.Available.Vram.Cmp(allocatedVram)).To(Equal(1))
 		})
-
-		It("should continue deallocating when some GPUs don't exist", func() {
-			// First allocate resources to multiple GPUs
-			request := tfv1.Resource{
-				Tflops: resource.MustParse("20"),
-				Vram:   resource.MustParse("4Gi"),
-			}
-
-			// Allocate 2 GPUs
-			allocatedGPUs, err := allocator.Alloc(ctx, "test-pool", request, 2)
-			Expect(err).NotTo(HaveOccurred())
-			Expect(allocatedGPUs).To(HaveLen(2))
-
-			// Create a non-existent GPU
-			nonExistentGPU := &tfv1.GPU{
-				ObjectMeta: metav1.ObjectMeta{
-					Name:      "non-existent-gpu",
-					Namespace: "default",
-				},
-			}
-
-			// Add the non-existent GPU to the list
-			gpusToDealloc := append(allocatedGPUs, nonExistentGPU)
-
-			// Store the allocated values for existing GPUs
-			initialStates := make(map[string]struct {
-				tflops resource.Quantity
-				vram   resource.Quantity
-			})
-			for _, gpu := range allocatedGPUs {
-				initialStates[gpu.Name] = struct {
-					tflops resource.Quantity
-					vram   resource.Quantity
-				}{
-					tflops: gpu.Status.Available.Tflops.DeepCopy(),
-					vram:   gpu.Status.Available.Vram.DeepCopy(),
-				}
-			}
-			gpusToDeallocKeys := lo.Map(gpusToDealloc, func(gpu *tfv1.GPU, _ int) types.NamespacedName {
-				return client.ObjectKeyFromObject(gpu)
-			})
-			// Now deallocate all GPUs including the non-existent one
-			err = allocator.Dealloc(ctx, request, gpusToDeallocKeys)
-			Expect(err).NotTo(HaveOccurred())
-
-			// Verify resources were restored for existing GPUs
-			for _, allocatedGPU := range allocatedGPUs {
-				deallocatedGPU := getGPU(allocatedGPU.Name, allocatedGPU.Namespace)
-				initialState := initialStates[allocatedGPU.Name]
-
-				expectedTflops := initialState.tflops.DeepCopy()
-				expectedVram := initialState.vram.DeepCopy()
-				expectedTflops.Add(request.Tflops)
-				expectedVram.Add(request.Vram)
-
-				Expect(deallocatedGPU.Status.Available.Tflops.Cmp(initialState.tflops)).To(Equal(1))
-				Expect(deallocatedGPU.Status.Available.Vram.Cmp(initialState.vram)).To(Equal(1))
-			}
-		})
 	})
 
 	Context("Event Handling", func() {
diff --git a/internal/metrics/worker.go b/internal/metrics/worker.go
@@ -42,14 +42,6 @@ var (
 		labels,
 	)
 
-	GpuCount = prometheus.NewGaugeVec(
-		prometheus.GaugeOpts{
-			Name: "gpu_count",
-			Help: "Number of GPUs allocated to the workload",
-		},
-		labels,
-	)
-
 	AllocatedTflopsPercent = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Name: "allocated_compute_percentage",
@@ -66,13 +58,5 @@ var (
 )
 
 func init() {
-	metrics.Registry.MustRegister(
-		GpuTflopsRequest,
-		GpuTflopsLimit,
-		VramBytesRequest,
-		VramBytesLimit,
-		AllocatedTflopsPercent,
-		AllocatedVramBytes,
-		GpuCount,
-	)
+	metrics.Registry.MustRegister(GpuTflopsRequest, GpuTflopsLimit, VramBytesRequest, VramBytesLimit)
 }
diff --git a/internal/webhook/v1/pod_webhook.go b/internal/webhook/v1/pod_webhook.go
diff --git a/internal/webhook/v1/tf_parser.go b/internal/webhook/v1/tf_parser.go
diff --git a/internal/worker/worker.go b/internal/worker/worker.go