fix: hypervisor permission issue; assign port from leader

Code2Life · Code2Life · commit b94e596118eb · 2025-05-31T14:47:40.000+08:00
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -6,6 +6,7 @@
         "AMDCDNA",
         "AMDRDNA",
         "apimachinery",
+        "automount",
         "AWSGPU",
         "batchv",
         "burstable",
@@ -41,6 +42,7 @@
         "greptime",
         "greptimedb",
         "healthz",
+        "iface",
         "karpenter",
         "kubebuilder",
         "KUBECONFIG",
@@ -71,6 +73,7 @@
         "tensorfusionaiv",
         "tensorfusioncluster",
         "tensorfusionclusters",
+        "tensorfusionworkload",
         "Tera",
         "tflops",
         "Tmpl",
diff --git a/charts/tensor-fusion/Chart.yaml b/charts/tensor-fusion/Chart.yaml
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 1.3.1
+version: 1.3.2
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
diff --git a/charts/tensor-fusion/templates/controller-deployment.yaml b/charts/tensor-fusion/templates/controller-deployment.yaml
@@ -91,6 +91,9 @@ spec:
             requests:
               cpu: 50m
               memory: 64Mi
+            limits:
+              cpu: 1000m
+              memory: 512Mi
           volumeMounts:
             - name: logs
               mountPath: /logs
diff --git a/charts/tensor-fusion/templates/rbac-hypervisor.yaml b/charts/tensor-fusion/templates/rbac-hypervisor.yaml
@@ -0,0 +1,14 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: tensor-fusion-hypervisor-role
+rules:
+- apiGroups:
+  - ""
+  resources:
+  - nodes
+  - pods
+  verbs:
+  - get
+  - list
+  - watch
diff --git a/charts/tensor-fusion/templates/serviceaccount-hypervisor.yaml b/charts/tensor-fusion/templates/serviceaccount-hypervisor.yaml
@@ -0,0 +1,10 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  # Service account used by the hypervisor to watch vGPU worker auto-scaling events and collect Pod log metadata.
+  # The name is fixed, and the account only needs read permission on pods and nodes.
+  name: tensor-fusion-hypervisor-sa
+  namespace: {{ include "tensor-fusion.namespace" . }}
+  labels:
+    {{- include "tensor-fusion.labels" . | nindent 4 }}
+automountServiceAccountToken: true
diff --git a/cmd/main.go b/cmd/main.go
@@ -347,7 +347,12 @@ func main() {
 		setupLog.Error(err, "failed to create connection router")
 		os.Exit(1)
 	}
-	httpServer := server.NewHTTPServer(connectionRouter)
+	assignHostPortRouter, err := router.NewAssignHostPortRouter(ctx, portAllocator)
+	if err != nil {
+		setupLog.Error(err, "failed to create assign host port router")
+		os.Exit(1)
+	}
+	httpServer := server.NewHTTPServer(connectionRouter, assignHostPortRouter)
 	go func() {
 		err := httpServer.Run()
 		if err != nil {
diff --git a/internal/constants/constants.go b/internal/constants/constants.go
@@ -85,8 +85,9 @@ const (
 	NamespaceEnv               = "OPERATOR_NAMESPACE"
 	NamespaceDefaultVal        = "tensor-fusion-sys"
 
-	KubernetesHostNameLabel = "kubernetes.io/hostname"
-	GiBToBytes              = 1024 * 1024 * 1024
+	KubernetesHostNameLabel      = "kubernetes.io/hostname"
+	GiBToBytes                   = 1024 * 1024 * 1024
+	HypervisorServiceAccountName = "tensor-fusion-hypervisor-sa"
 )
 
 const (
@@ -138,3 +139,8 @@ const (
 const TFDataPath = "/tmp/tensor-fusion/data"
 const DataVolumeName = "tf-data"
 const TensorFusionPoolManualCompaction = Domain + "/manual-compaction"
+
+const (
+	LeaderInfoConfigMapName        = "tensor-fusion-operator-leader-info"
+	LeaderInfoConfigMapLeaderIPKey = "leader-ip"
+)
diff --git a/internal/controller/gpunode_controller.go b/internal/controller/gpunode_controller.go
@@ -469,6 +469,7 @@ func (r *GPUNodeReconciler) createHypervisorPod(ctx context.Context, key client.
 		ReadOnly:  false,
 		MountPath: constants.TFDataPath,
 	})
+	spec.ServiceAccountName = constants.HypervisorServiceAccountName
 	newPod := &corev1.Pod{
 		ObjectMeta: metav1.ObjectMeta{
 			Name:      key.Name,
diff --git a/internal/portallocator/portallocator.go b/internal/portallocator/portallocator.go
@@ -9,39 +9,33 @@ import (
 	"sync"
 
 	"github.com/NexusGPU/tensor-fusion/internal/constants"
+	"github.com/NexusGPU/tensor-fusion/internal/utils"
 	v1 "k8s.io/api/core/v1"
+	"k8s.io/client-go/util/retry"
 
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
 	"sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/controller-runtime/pkg/manager"
 )
 
-// offer API for host port allocation, range from user configured port range
-// when started, fetch all allocated TENSOR_FUSION_WORKER_PORT
-
-// - label: tensor-fusion.ai/component=worker
-//   portStart: 40000
-//   portEnd: 41000
-//   byNode: true
-// - label: tensor-fusion.ai/workload-type=lab
-//   portStart: 41001
-//   portEnd: 60000
-//   byNode: false
-
-// PodLabel => NodeName => HostPort
-// Annotation: tensor-fusion.ai/host-port: assigned port
-// Annotation: tensor-fusion.ai/host-port: assigned pod name
+// PortAllocator offers an API for host port allocation within the user-configured port ranges.
+// Use the label `tensor-fusion.ai/host-port: auto` to have a port assigned at cluster level.
+// The vGPU worker's hostPort is managed by the operator.
 type PortAllocator struct {
 	PortRangeStartNode int
 	PortRangeEndNode   int
 
 	PortRangeStartCluster int
 	PortRangeEndCluster   int
 
-	client client.Client
+	IsLeader bool
 
 	BitmapPerNode map[string][]uint64
 	BitmapCluster []uint64
+
+	Client client.Client
 }
 
 var storeMutexNode sync.RWMutex
@@ -72,10 +66,10 @@ func NewPortAllocator(ctx context.Context, client client.Client, nodeLevelPortRa
 		PortRangeEndNode:      portRangeEndNode,
 		PortRangeStartCluster: portRangeStartCluster,
 		PortRangeEndCluster:   portRangeEndCluster,
-		client:                client,
-
-		BitmapPerNode: make(map[string][]uint64),
-		BitmapCluster: make([]uint64, (portRangeEndCluster-portRangeStartCluster)/64+1),
+		Client:                client,
+		IsLeader:              false,
+		BitmapPerNode:         make(map[string][]uint64),
+		BitmapCluster:         make([]uint64, (portRangeEndCluster-portRangeStartCluster)/64+1),
 	}
 
 	return allocator, nil
@@ -85,6 +79,23 @@ func (s *PortAllocator) SetupWithManager(ctx context.Context, mgr manager.Manage
 	go func() {
 		<-mgr.Elected()
 
+		s.IsLeader = true
+		leaderInfo := &v1.ConfigMap{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      constants.LeaderInfoConfigMapName,
+				Namespace: utils.CurrentNamespace(),
+			},
+		}
+		retry.RetryOnConflict(retry.DefaultBackoff, func() error {
+			_, err := controllerutil.CreateOrUpdate(ctx, s.Client, leaderInfo, func() error {
+				leaderInfo.Data = map[string]string{
+					constants.LeaderInfoConfigMapLeaderIPKey: utils.CurrentIP(),
+				}
+				return nil
+			})
+			return err
+		})
+
 		storeMutexNode.Lock()
 		storeMutexCluster.Lock()
 		defer storeMutexNode.Unlock()
@@ -98,6 +109,18 @@ func (s *PortAllocator) SetupWithManager(ctx context.Context, mgr manager.Manage
 	}()
 }
 
+func (s *PortAllocator) GetLeaderIP() string {
+	leaderInfo := &v1.ConfigMap{}
+	s.Client.Get(context.Background(), client.ObjectKey{
+		Name:      constants.LeaderInfoConfigMapName,
+		Namespace: utils.CurrentNamespace(),
+	}, leaderInfo)
+	if leaderInfo.Data == nil {
+		return ""
+	}
+	return leaderInfo.Data[constants.LeaderInfoConfigMapLeaderIPKey]
+}
+
 // AssignHostPort always called by operator itself, thus no Leader-Follower inconsistency issue
 func (s *PortAllocator) AssignHostPort(nodeName string) (int, error) {
 	if nodeName == "" {
@@ -179,7 +202,7 @@ func (s *PortAllocator) ReleaseClusterLevelHostPort(podName string, port int) er
 func (s *PortAllocator) initBitMapForClusterLevelPortAssign(ctx context.Context) {
 	log := log.FromContext(ctx)
 	podList := &v1.PodList{}
-	err := s.client.List(ctx, podList, client.MatchingLabels{constants.GenHostPortLabel: constants.GenHostPortLabelValue})
+	err := s.Client.List(ctx, podList, client.MatchingLabels{constants.GenHostPortLabel: constants.GenHostPortLabelValue})
 	if err != nil {
 		log.Error(err, "failed to list pods with port allocation label")
 		return
@@ -199,7 +222,7 @@ func (s *PortAllocator) initBitMapForClusterLevelPortAssign(ctx context.Context)
 func (s *PortAllocator) initBitMapForNodeLevelPortAssign(ctx context.Context) {
 	log := log.FromContext(ctx)
 	podList := &v1.PodList{}
-	err := s.client.List(ctx, podList, client.MatchingLabels{constants.LabelComponent: constants.ComponentWorker})
+	err := s.Client.List(ctx, podList, client.MatchingLabels{constants.LabelComponent: constants.ComponentWorker})
 	if err != nil {
 		log.Error(err, "failed to list pods with port allocation label")
 		return
diff --git a/internal/server/router/assign_host_port.go b/internal/server/router/assign_host_port.go
@@ -0,0 +1,33 @@
+package router
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+
+	"github.com/NexusGPU/tensor-fusion/internal/portallocator"
+	"github.com/gin-gonic/gin"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+)
+
+type AssignHostPortRouter struct {
+	allocator *portallocator.PortAllocator
+}
+
+func NewAssignHostPortRouter(ctx context.Context, allocator *portallocator.PortAllocator) (*AssignHostPortRouter, error) {
+	return &AssignHostPortRouter{allocator: allocator}, nil
+}
+
+func (r *AssignHostPortRouter) AssignHostPort(ctx *gin.Context) {
+	// TODO: verify the service account token; its issuer must be the same as the current instance's,
+	// i.e. the request must come from a peer operator Pod.
+
+	podName := ctx.Query("podName")
+	port, err := r.allocator.AssignClusterLevelHostPort(podName)
+	if err != nil {
+		ctx.String(http.StatusInternalServerError, err.Error())
+		return
+	}
+	log.FromContext(ctx).Info("assigned host port", "podName", podName, "port", port)
+	ctx.String(http.StatusOK, fmt.Sprintf("%d", port))
+}
diff --git a/internal/server/server.go b/internal/server/server.go
@@ -8,6 +8,7 @@ import (
 
 func NewHTTPServer(
 	cr *router.ConnectionRouter,
+	ahp *router.AssignHostPortRouter,
 ) *gin.Engine {
 
 	r := gin.New()
@@ -17,5 +18,6 @@ func NewHTTPServer(
 
 	apiGroup := r.Group("/api")
 	apiGroup.GET("/connection", cr.Get)
+	apiGroup.POST("/assign-host-port", ahp.AssignHostPort)
 	return r
 }
diff --git a/internal/utils/net.go b/internal/utils/net.go
@@ -0,0 +1,37 @@
+package utils
+
+import "net"
+
+func CurrentIP() string {
+	interfaces, err := net.Interfaces()
+	if err != nil {
+		panic(err)
+	}
+
+	for _, iface := range interfaces {
+		if iface.Flags&net.FlagUp == 0 || iface.Flags&net.FlagLoopback != 0 {
+			continue
+		}
+
+		addrs, err := iface.Addrs()
+		if err != nil {
+			continue
+		}
+
+		for _, addr := range addrs {
+			ipNet, ok := addr.(*net.IPNet)
+			if !ok {
+				continue
+			}
+
+			ip := ipNet.IP
+			if ip.IsLoopback() || ip.To4() == nil {
+				continue
+			}
+
+			return ip.String()
+		}
+	}
+
+	panic("no internal IP address found")
+}
diff --git a/internal/webhook/v1/pod_webhook.go b/internal/webhook/v1/pod_webhook.go