
Commit a52074b

fix: port allocator issues

1 parent 56fe9aa

15 files changed: +179 −74 lines

.vscode/settings.json

Lines changed: 1 addition & 0 deletions
@@ -65,6 +65,7 @@
   "schedulingconfigtemplates",
   "schedulingcorev",
   "shirou",
+  "strategicpatches",
   "subresource",
   "tensorfusion",
   "tensorfusionaiv",

charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml

Lines changed: 2 additions & 1 deletion
@@ -546,7 +546,8 @@ spec:
   description: Default requests and limitsOverRequests are
     same, indicates normal on-demand serverless GPU usage,
     in hands-on lab low QoS case, limitsOverRequests should
-    be cheaper, for example Low QoS, ratio should be 0.5
+    be lower, so that user can get burstable GPU resources
+    with very low cost
   type: string
 qos:
   enum:

charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml

Lines changed: 2 additions & 2 deletions
@@ -650,8 +650,8 @@ spec:
   description: Default requests and limitsOverRequests
     are same, indicates normal on-demand serverless
     GPU usage, in hands-on lab low QoS case, limitsOverRequests
-    should be cheaper, for example Low QoS, ratio
-    should be 0.5
+    should be lower, so that user can get burstable
+    GPU resources with very low cost
   type: string
 qos:
   enum:

cmd/main.go

Lines changed: 22 additions & 13 deletions
@@ -92,8 +92,11 @@ func main() {
 	flag.StringVar(&gpuInfoConfig, "gpu-info-config",
 		"/etc/tensor-fusion/gpu-info.yaml", "specify the path to gpuInfoConfig file")
 	flag.StringVar(&metricsPath, "metrics-path", "/logs/metrics.log", "specify the path to metrics file")
-	flag.StringVar(&nodeLevelPortRange, "host-port-range", "40000-42000", "specify the port range for assigning ports to pre-scheduled Pods such as vGPU workers")
-	flag.StringVar(&clusterLevelPortRange, "cluster-host-port-range", "42000-62000", "specify the port range for assigning ports to random Pods marked with `tensor-fusion.ai/host-port: auto` and `tensor-fusion.ai/port-name: ssh`")
+	flag.StringVar(&nodeLevelPortRange, "host-port-range", "40000-42000",
+		"specify the port range for assigning ports to pre-scheduled Pods such as vGPU workers")
+	flag.StringVar(&clusterLevelPortRange, "cluster-host-port-range", "42000-62000",
+		"specify the port range for assigning ports to random Pods"+
+			" marked with `tensor-fusion.ai/host-port: auto` and `tensor-fusion.ai/port-name: ssh`")
 	opts := zap.Options{
 		Development: true,
 	}
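With these defaults, node-level worker ports come from 40000-42000 and cluster-level auto-assigned ports from 42000-62000; an operator deployment could narrow either range, e.g. by passing --host-port-range=40000-41000 (the flag names are from this diff; the invocation itself is only illustrative).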
@@ -192,14 +195,8 @@ func main() {
 		// Key is poolName, second level key is QoS level
 		WorkerUnitPriceMap: make(map[string]map[string]metrics.RawBillingPricing),
 	}
-	if enableLeaderElection {
-		go func() {
-			<-mgr.Elected()
-			metricsRecorder.Start()
-		}()
-	} else {
-		go metricsRecorder.Start()
-	}
+
+	startMetricsRecorder(enableLeaderElection, mgr, metricsRecorder)
 
 	// Initialize GPU allocator and set up watches
 	allocator := gpuallocator.NewGpuAllocator(ctx, mgr.GetClient(), 10*time.Second)
@@ -235,7 +232,7 @@ func main() {
 
 	// nolint:goconst
 	if os.Getenv("ENABLE_WEBHOOKS") != "false" {
-		if err = webhookcorev1.SetupPodWebhookWithManager(mgr); err != nil {
+		if err = webhookcorev1.SetupPodWebhookWithManager(mgr, portAllocator); err != nil {
 			setupLog.Error(err, "unable to create webhook", "webhook", "Pod")
 			os.Exit(1)
 		}
@@ -291,8 +288,9 @@ func main() {
 		os.Exit(1)
 	}
 	if err = (&controller.PodReconciler{
-		Client: mgr.GetClient(),
-		Scheme: mgr.GetScheme(),
+		Client:        mgr.GetClient(),
+		Scheme:        mgr.GetScheme(),
+		PortAllocator: portAllocator,
 	}).SetupWithManager(mgr); err != nil {
 		setupLog.Error(err, "unable to create controller", "controller", "Pod")
 		os.Exit(1)
@@ -376,6 +374,17 @@ func main() {
 	}
 }
 
+func startMetricsRecorder(enableLeaderElection bool, mgr manager.Manager, metricsRecorder metrics.MetricsRecorder) {
+	if enableLeaderElection {
+		go func() {
+			<-mgr.Elected()
+			metricsRecorder.Start()
+		}()
+	} else {
+		go metricsRecorder.Start()
+	}
+}
+
 func watchGPUInfoChanges(gpuInfoConfig string, gpuInfos *[]config.GpuInfo, gpuPricingMap map[string]float64) {
 	var lastModTime time.Time
 	if fileInfo, err := os.Stat(gpuInfoConfig); err == nil {
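The extracted startMetricsRecorder preserves the single-writer guarantee: with leader election enabled, the recorder only starts once this replica wins the election. A minimal generic sketch of the same gating pattern, assuming only the controller-runtime Manager interface (startWhenElected and the start callback are illustrative names, not part of this codebase):

package metricsutil

import (
	"sigs.k8s.io/controller-runtime/pkg/manager"
)

// startWhenElected runs start() in a goroutine, delayed until this replica
// becomes leader when leader election is enabled. Sketch only.
func startWhenElected(enableLeaderElection bool, mgr manager.Manager, start func()) {
	if enableLeaderElection {
		go func() {
			// mgr.Elected() returns a channel that is closed once this
			// manager instance wins the leader election.
			<-mgr.Elected()
			start()
		}()
	} else {
		// Single-replica mode: no election, start immediately.
		go start()
	}
}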

config/crd/bases/tensor-fusion.ai_gpupools.yaml

Lines changed: 2 additions & 1 deletion
@@ -546,7 +546,8 @@ spec:
   description: Default requests and limitsOverRequests are
     same, indicates normal on-demand serverless GPU usage,
     in hands-on lab low QoS case, limitsOverRequests should
-    be cheaper, for example Low QoS, ratio should be 0.5
+    be lower, so that user can get burstable GPU resources
+    with very low cost
   type: string
 qos:
   enum:

config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml

Lines changed: 2 additions & 2 deletions
@@ -650,8 +650,8 @@ spec:
   description: Default requests and limitsOverRequests
     are same, indicates normal on-demand serverless
     GPU usage, in hands-on lab low QoS case, limitsOverRequests
-    should be cheaper, for example Low QoS, ratio
-    should be 0.5
+    should be lower, so that user can get burstable
+    GPU resources with very low cost
   type: string
 qos:
   enum:

internal/cloudprovider/common/utils.go

Lines changed: 4 additions & 2 deletions
@@ -8,9 +8,10 @@ import (
 	"strings"
 	"time"
 
+	"math/rand"
+
 	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
 	"github.com/NexusGPU/tensor-fusion/internal/cloudprovider/types"
-	"golang.org/x/exp/rand"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	"sigs.k8s.io/controller-runtime/pkg/log"
@@ -205,7 +206,8 @@ func contains(slice []string, item string) bool {
 
 func generateRandomString(length int) string {
 	const charset = "abcdefghijklmnopqrstuvwxyz"
-	rand.Seed(uint64(time.Now().UnixNano()))
+	source := rand.NewSource(time.Now().UnixNano())
+	rand := rand.New(source)
 
 	result := make([]byte, length)
 	for i := range result {
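golang.org/x/exp/rand exposed a package-global Seed; the replacement builds a private math/rand generator instead, so callers no longer mutate shared RNG state. A self-contained sketch of the pattern (here the local generator is named rng rather than shadowing the rand package, a stylistic assumption):

package main

import (
	"fmt"
	"math/rand"
	"time"
)

// generateRandomString draws length lowercase letters from a locally
// seeded generator; no package-global RNG state is touched.
func generateRandomString(length int) string {
	const charset = "abcdefghijklmnopqrstuvwxyz"
	rng := rand.New(rand.NewSource(time.Now().UnixNano()))

	result := make([]byte, length)
	for i := range result {
		result[i] = charset[rng.Intn(len(charset))]
	}
	return string(result)
}

func main() {
	fmt.Println(generateRandomString(8)) // e.g. "kqzwhbtu"
}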

internal/controller/pod_controller.go

Lines changed: 13 additions & 1 deletion
@@ -19,9 +19,11 @@ package controller
 import (
 	"context"
 	"fmt"
+	"strconv"
 
 	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
 	"github.com/NexusGPU/tensor-fusion/internal/constants"
+	"github.com/NexusGPU/tensor-fusion/internal/portallocator"
 	"github.com/NexusGPU/tensor-fusion/internal/utils"
 	v1 "github.com/NexusGPU/tensor-fusion/internal/webhook/v1"
 	"github.com/samber/lo"
@@ -40,7 +42,8 @@ import (
 // PodReconciler reconciles a Pod object
 type PodReconciler struct {
 	client.Client
-	Scheme *runtime.Scheme
+	Scheme        *runtime.Scheme
+	PortAllocator *portallocator.PortAllocator
 }
 
 // +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;create;update;patch;delete;deletecollection
@@ -59,6 +62,15 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
 		log.Error(err, "Failed to get Pod")
 		return ctrl.Result{}, err
 	}
+
+	// Release cluster level port when Pod deleted
+	if !pod.DeletionTimestamp.IsZero() {
+		if pod.Annotations[constants.GenHostPortLabel] == constants.GenHostPortLabelValue {
+			podPortNumber, _ := strconv.Atoi(pod.Annotations[constants.GenPortNumberAnnotation])
+			_ = r.PortAllocator.ReleaseClusterLevelHostPort(pod.Name, podPortNumber)
+			log.Info("Released port", "pod", pod.Name, "port", podPortNumber)
+		}
+	}
 	// generate tensor fusion connections and apply to cluster
 	tfConnection := generateTensorFusionConnection(pod)
 	if tfConnection == nil {
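Allocation and release are split: the Pod webhook (now handed the portAllocator in cmd/main.go) presumably reserves a cluster-level port at admission and records it in the tensor-fusion.ai annotations, while this reconciler returns it when the Pod terminates. The PortAllocator implementation is not part of this diff; a minimal mutex-guarded sketch of what ReleaseClusterLevelHostPort could sit on top of (every name here except the two methods referenced in the diff is hypothetical):

package portallocator

import (
	"fmt"
	"sync"
)

// clusterAllocator hands out host ports from a fixed cluster-level range,
// tracking which pod owns each port. Sketch only.
type clusterAllocator struct {
	mu       sync.Mutex
	from, to int
	used     map[int]string // port -> owner pod name
}

func newClusterAllocator(from, to int) *clusterAllocator {
	return &clusterAllocator{from: from, to: to, used: map[int]string{}}
}

// AssignClusterLevelHostPort reserves the first free port for the given pod.
func (p *clusterAllocator) AssignClusterLevelHostPort(pod string) (int, error) {
	p.mu.Lock()
	defer p.mu.Unlock()
	for port := p.from; port < p.to; port++ {
		if _, taken := p.used[port]; !taken {
			p.used[port] = pod
			return port, nil
		}
	}
	return 0, fmt.Errorf("no free port in range %d-%d", p.from, p.to)
}

// ReleaseClusterLevelHostPort frees the port if it is still owned by the pod,
// so a stale release from a recreated pod cannot free someone else's port.
func (p *clusterAllocator) ReleaseClusterLevelHostPort(pod string, port int) error {
	p.mu.Lock()
	defer p.mu.Unlock()
	if owner, ok := p.used[port]; ok && owner == pod {
		delete(p.used, port)
	}
	return nil
}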

internal/controller/tensorfusioncluster_controller.go

Lines changed: 3 additions & 0 deletions
@@ -420,6 +420,7 @@ func (r *TensorFusionClusterReconciler) SetupWithManager(mgr ctrl.Manager) error
 
 // Update metrics recorder's raw billing map
 func (r *TensorFusionClusterReconciler) updateMetricsRecorder(ctx context.Context, pool *tfv1.GPUPool) {
+	log := log.FromContext(ctx)
 	qosConfig := pool.Spec.QosConfig
 	if _, ok := r.MetricsRecorder.WorkerUnitPriceMap[pool.Name]; !ok {
 		r.MetricsRecorder.WorkerUnitPriceMap[pool.Name] = make(map[string]metrics.RawBillingPricing)
@@ -438,4 +439,6 @@ func (r *TensorFusionClusterReconciler) updateMetricsRecorder(ctx context.Contex
 			VramOverRequestPerSecond: vramPerSecond / 3600 * limitOverRequestChargingRatio,
 		}
 	}
+
+	log.V(5).Info("Updated metrics recorder", "pool", pool.Name, "pricing", pricingDetail)
 }

internal/controller/tensorfusionworkload_controller.go

Lines changed: 16 additions & 8 deletions
@@ -20,6 +20,7 @@ import (
 	"context"
 	"fmt"
 	"sort"
+	"strconv"
 	"time"
 
 	corev1 "k8s.io/api/core/v1"
@@ -58,6 +59,8 @@ type TensorFusionWorkloadReconciler struct {
 // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=tensorfusionworkloads/finalizers,verbs=update
 
 // TensorFusionWorkload Reconciler
+//
+//nolint:gocyclo
 func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
 	log := log.FromContext(ctx)
 	log.Info("Reconciling TensorFusionWorkload", "request", req)
@@ -110,6 +113,8 @@ func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl
 
 	if deleted {
 		metrics.RemoveWorkerMetrics(pod.Name, pod.DeletionTimestamp.Time)
+		podPort, _ := strconv.Atoi(pod.Annotations[constants.GenPortNumberAnnotation])
+		_ = r.PortAllocator.ReleaseHostPort(pod.Spec.NodeName, podPort)
 	}
 
 	// Handle our GPU resource cleanup finalizer
115120
// Handle our GPU resource cleanup finalizer
@@ -127,12 +132,7 @@ func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl
 		return ctrl.Result{RequeueAfter: constants.PendingRequeueDuration}, nil
 	}
 
-	// init metrics map if needed
-	now := time.Now()
-	for i := range podList.Items {
-		pod := &podList.Items[i]
-		metrics.SetWorkerMetricsByWorkload(pod, workload, now)
-	}
+	handleMetricsRecorder(podList, workload)
 
 	// Fetch the GPUPool
 	pool := &tfv1.GPUPool{}
@@ -228,14 +228,22 @@ func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl
 	return ctrl.Result{}, nil
 }
 
+func handleMetricsRecorder(podList *corev1.PodList, workload *tfv1.TensorFusionWorkload) {
+	now := time.Now()
+	for i := range podList.Items {
+		pod := &podList.Items[i]
+		metrics.SetWorkerMetricsByWorkload(pod, workload, now)
+	}
+}
+
 func (r *TensorFusionWorkloadReconciler) tryStartWorker(
 	ctx context.Context,
 	workerGenerator *worker.WorkerGenerator,
 	gpu *tfv1.GPU,
 	workload *tfv1.TensorFusionWorkload,
 	hash string,
 ) (*corev1.Pod, error) {
-	port, err := r.PortAllocator.GetHostPort(gpu.Status.NodeSelector[constants.KubernetesHostNameLabel])
+	port, err := r.PortAllocator.AssignHostPort(gpu.Status.NodeSelector[constants.KubernetesHostNameLabel])
 	if err != nil {
 		return nil, fmt.Errorf("get host port %w", err)
 	}
@@ -270,7 +278,7 @@ func (r *TensorFusionWorkloadReconciler) scaleDownWorkers(ctx context.Context, w
 
 	for i := range pods {
 		podToDelete := &pods[i]
-		log.Info("Scaling down worker pod", "name", podToDelete.Name)
+		log.Info("Scaling down worker pod", "name", podToDelete.Name, "workload", workload.Name)
 		// Delete the pod with foreground deletion policy
 		// The finalizer will handle GPU resource cleanup
 		if err := r.deletePod(ctx, podToDelete); err != nil {
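Unlike the cluster-level range, AssignHostPort and ReleaseHostPort are keyed by node name, so every node can hand out the same 40000-42000 range independently. A hedged sketch of that per-node bookkeeping (all identifiers illustrative; the real allocator in internal/portallocator is not shown in this commit):

package portallocator

import (
	"fmt"
	"sync"
)

// nodeAllocator tracks used host ports per node. Sketch only.
type nodeAllocator struct {
	mu       sync.Mutex
	from, to int
	used     map[string]map[int]bool // node name -> set of used ports
}

func newNodeAllocator(from, to int) *nodeAllocator {
	return &nodeAllocator{from: from, to: to, used: map[string]map[int]bool{}}
}

// AssignHostPort reserves the first free port on the given node.
func (a *nodeAllocator) AssignHostPort(node string) (int, error) {
	a.mu.Lock()
	defer a.mu.Unlock()
	if a.used[node] == nil {
		a.used[node] = map[int]bool{}
	}
	for port := a.from; port < a.to; port++ {
		if !a.used[node][port] {
			a.used[node][port] = true
			return port, nil
		}
	}
	return 0, fmt.Errorf("node %s: no free port in range %d-%d", node, a.from, a.to)
}

// ReleaseHostPort frees a previously assigned port on the node; releasing an
// unknown port is a no-op, matching the ignored error in the reconciler above.
func (a *nodeAllocator) ReleaseHostPort(node string, port int) error {
	a.mu.Lock()
	defer a.mu.Unlock()
	delete(a.used[node], port)
	return nil
}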

internal/controller/workloadprofile_controller.go

Lines changed: 1 addition & 12 deletions
@@ -37,20 +37,9 @@ type WorkloadProfileReconciler struct {
 // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=workloadprofiles/status,verbs=get;update;patch
 // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=workloadprofiles/finalizers,verbs=update
 
-// Reconcile is part of the main kubernetes reconciliation loop which aims to
-// move the current state of the cluster closer to the desired state.
-// TODO(user): Modify the Reconcile function to compare the state specified by
-// the WorkloadProfile object against the actual cluster state, and then
-// perform operations to make the cluster state reflect the state specified by
-// the user.
-//
-// For more details, check Reconcile and its Result here:
-// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/reconcile
+// WorkloadProfile is a template to be referred by TensorFusionWorkload, no logic for reconcile
 func (r *WorkloadProfileReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
 	_ = log.FromContext(ctx)
-
-	// TODO(user): your logic here
-
 	return ctrl.Result{}, nil
 }
