diff --git a/.gitignore b/.gitignore index 97a779a7..d1783912 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ Dockerfile.cross # Output of the go coverage tool, specifically when used with LiteIDE *.out +cover.out.* # Go workspace file go.work diff --git a/.vscode/launch.json b/.vscode/launch.json index e34c9fad..f14afdd2 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -55,7 +55,10 @@ "type": "go", "request": "launch", "mode": "test", - "program": "${workspaceFolder}", + "env": { + "GO_TESTING": "true" + }, + "program": "${workspaceFolder}/internal/controller", "console": "integratedTerminal" } ] diff --git a/.vscode/settings.json b/.vscode/settings.json index ec8decc1..4a4e6403 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,12 +1,15 @@ { "cSpell.words": [ + "alertmanager", "alicloud", "Aliyun", "AMDCDNA", "AMDRDNA", "apimachinery", + "automount", "AWSGPU", "batchv", + "burstable", "CDNA", "certificaterequests", "certmanager", @@ -39,6 +42,7 @@ "greptime", "greptimedb", "healthz", + "iface", "karpenter", "kubebuilder", "KUBECONFIG", @@ -51,6 +55,7 @@ "NVML", "omitempty", "onsi", + "portallocator", "printcolumn", "prometheusagents", "prometheuses", @@ -62,11 +67,13 @@ "schedulingconfigtemplates", "schedulingcorev", "shirou", + "strategicpatches", "subresource", "tensorfusion", "tensorfusionaiv", "tensorfusioncluster", "tensorfusionclusters", + "tensorfusionworkload", "Tera", "tflops", "Tmpl", diff --git a/Makefile b/Makefile index 28c3dd41..73c5441d 100644 --- a/Makefile +++ b/Makefile @@ -62,13 +62,8 @@ vet: ## Run go vet against code. .PHONY: test test: manifests generate fmt vet envtest ## Run tests. - KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -timeout 0 -coverprofile cover.out + KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" GO_TESTING=true go run github.com/onsi/ginkgo/v2/ginkgo -p -timeout 0 -cover -coverprofile cover.out -r --skip-file ./test/e2e -# TODO(user): To use a different vendor for e2e tests, modify the setup under 'tests/e2e'. -# The default setup assumes Kind is pre-installed and builds/loads the Manager Docker image locally. -# Prometheus and CertManager are installed by default; skip with: -# - PROMETHEUS_INSTALL_SKIP=true -# - CERT_MANAGER_INSTALL_SKIP=true .PHONY: test-e2e test-e2e: manifests generate fmt vet ## Run the e2e tests. Expected an isolated environment using Kind. 
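// Note on the GO_TESTING flag introduced above: both the VS Code launch configuration and the
// Makefile test target now export GO_TESTING=true before the suite starts. A minimal sketch of
// the kind of guard this enables follows; the helper name and package are hypothetical, since
// this diff does not show where the variable is consumed.
package utils

import "os"

// IsTestMode reports whether the process was started by the test harness
// (ginkgo via `make test`, or the VS Code debug configuration).
func IsTestMode() bool {
	return os.Getenv("GO_TESTING") == "true"
}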
@command -v kind >/dev/null 2>&1 || { \ diff --git a/api/v1/gpu_types.go b/api/v1/gpu_types.go index f42b1db5..5a073232 100644 --- a/api/v1/gpu_types.go +++ b/api/v1/gpu_types.go @@ -36,6 +36,18 @@ type GPUStatus struct { GPUModel string `json:"gpuModel"` Message string `json:"message"` + + // +optional + RunningApps []*RunningAppDetail `json:"runningApps,omitempty"` +} + +type RunningAppDetail struct { + // Workload name namespace + Name string `json:"name,omitempty"` + Namespace string `json:"namespace,omitempty"` + + // Worker count + Count int `json:"count"` } // +kubebuilder:validation:Enum=Pending;Provisioning;Running;Unknown;Destroying;Migrating diff --git a/api/v1/gpunode_funcs.go b/api/v1/gpunode_funcs.go index 39225880..21f73af9 100644 --- a/api/v1/gpunode_funcs.go +++ b/api/v1/gpunode_funcs.go @@ -12,7 +12,7 @@ func (node *GPUNode) InitializeStatus(initTFlops, initVRAM resource.Quantity, in TotalTFlops: initTFlops, TotalVRAM: initVRAM, TotalGPUs: initGPUs, - AllocationDetails: &[]GPUNodeAllocationDetails{}, + AllocationInfo: []*RunningAppDetail{}, LoadedModels: &[]string{}, ManagedGPUDeviceIDs: []string{}, ObservedGeneration: node.Generation, diff --git a/api/v1/gpunode_types.go b/api/v1/gpunode_types.go index 7423bf4a..a2a7ee08 100644 --- a/api/v1/gpunode_types.go +++ b/api/v1/gpunode_types.go @@ -94,20 +94,8 @@ type GPUNodeStatus struct { ObservedGeneration int64 `json:"observedGeneration,omitempty"` - // Allocation details is for node compaction, and calculate used apps // +optional - AllocationDetails *[]GPUNodeAllocationDetails `json:"allocationDetails,omitempty"` -} - -type GPUNodeAllocationDetails struct { - PodID string `json:"podID,omitempty"` - PodName string `json:"podName,omitempty"` - Namespace string `json:"namespace"` - WorkloadName string `json:"workload,omitempty"` - - Requests GPUResourceUnit `json:"requests"` - Limits GPUResourceUnit `json:"limits"` - QoS QoSLevel `json:"qos,omitempty"` + AllocationInfo []*RunningAppDetail `json:"allocationInfo,omitempty"` } // +kubebuilder:validation:Enum=Pending;Provisioning;Migrating;Running;Succeeded;Failed;Unknown;Destroying diff --git a/api/v1/gpupool_types.go b/api/v1/gpupool_types.go index db8b8989..bc7cd2bd 100644 --- a/api/v1/gpupool_types.go +++ b/api/v1/gpupool_types.go @@ -293,7 +293,7 @@ type QosPricing struct { Requests GPUResourcePricingUnit `json:"requests,omitempty"` - // Default requests and limitsOverRequests are same, indicates normal on-demand serverless GPU usage, in hands-on lab low QoS case, limitsOverRequests should be cheaper, for example Low QoS, ratio should be 0.5 + // Default requests and limitsOverRequests are same, indicates normal on-demand serverless GPU usage, in hands-on lab low QoS case, limitsOverRequests should be lower, so that user can get burstable GPU resources with very low cost // +kubebuilder:default="1" LimitsOverRequestsChargingRatio string `json:"limitsOverRequests,omitempty"` } @@ -372,6 +372,8 @@ type GPUPoolStatus struct { AvailableTFlops resource.Quantity `json:"availableTFlops"` AvailableVRAM resource.Quantity `json:"availableVRAM"` + RunningAppsCnt int32 `json:"runningAppsCnt,omitempty"` + // +optional VirtualAvailableTFlops *resource.Quantity `json:"virtualAvailableTFlops,omitempty"` // +optional diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 6d16045a..c20d35b8 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -448,23 +448,6 @@ func (in *GPUNode) DeepCopyObject() runtime.Object { return nil } -// 
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *GPUNodeAllocationDetails) DeepCopyInto(out *GPUNodeAllocationDetails) { - *out = *in - in.Requests.DeepCopyInto(&out.Requests) - in.Limits.DeepCopyInto(&out.Limits) -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeAllocationDetails. -func (in *GPUNodeAllocationDetails) DeepCopy() *GPUNodeAllocationDetails { - if in == nil { - return nil - } - out := new(GPUNodeAllocationDetails) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *GPUNodeClass) DeepCopyInto(out *GPUNodeClass) { *out = *in @@ -704,14 +687,14 @@ func (in *GPUNodeStatus) DeepCopyInto(out *GPUNodeStatus) { *out = make([]string, len(*in)) copy(*out, *in) } - if in.AllocationDetails != nil { - in, out := &in.AllocationDetails, &out.AllocationDetails - *out = new([]GPUNodeAllocationDetails) - if **in != nil { - in, out := *in, *out - *out = make([]GPUNodeAllocationDetails, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) + if in.AllocationInfo != nil { + in, out := &in.AllocationInfo, &out.AllocationInfo + *out = make([]*RunningAppDetail, len(*in)) + for i := range *in { + if (*in)[i] != nil { + in, out := &(*in)[i], &(*out)[i] + *out = new(RunningAppDetail) + **out = **in } } } @@ -961,6 +944,17 @@ func (in *GPUStatus) DeepCopyInto(out *GPUStatus) { (*out)[key] = val } } + if in.RunningApps != nil { + in, out := &in.RunningApps, &out.RunningApps + *out = make([]*RunningAppDetail, len(*in)) + for i := range *in { + if (*in)[i] != nil { + in, out := &(*in)[i], &(*out)[i] + *out = new(RunningAppDetail) + **out = **in + } + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUStatus. @@ -1597,6 +1591,21 @@ func (in *Resources) DeepCopy() *Resources { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RunningAppDetail) DeepCopyInto(out *RunningAppDetail) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RunningAppDetail. +func (in *RunningAppDetail) DeepCopy() *RunningAppDetail { + if in == nil { + return nil + } + out := new(RunningAppDetail) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ScaleToZero) DeepCopyInto(out *ScaleToZero) { *out = *in diff --git a/charts/tensor-fusion/Chart.yaml b/charts/tensor-fusion/Chart.yaml index 9fe41356..775a3726 100644 --- a/charts/tensor-fusion/Chart.yaml +++ b/charts/tensor-fusion/Chart.yaml @@ -15,10 +15,10 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 1.2.22 +version: 1.3.2 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. 
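// The RunningAppDetail type added in api/v1/gpu_types.go only tracks a workload's name, namespace
// and worker count; GPUStatus.RunningApps and GPUNodeStatus.AllocationInfo are slices of pointers
// to it. A minimal sketch of the bookkeeping an allocator could do when a worker lands on a GPU;
// the helper below is hypothetical and not part of this change (assumes
// tfv1 = "github.com/NexusGPU/tensor-fusion/api/v1", as imported elsewhere in this repo).
func addRunningApp(gpu *tfv1.GPU, workloadName, namespace string) {
	for _, app := range gpu.Status.RunningApps {
		if app.Name == workloadName && app.Namespace == namespace {
			app.Count++ // another worker of an already-known workload
			return
		}
	}
	gpu.Status.RunningApps = append(gpu.Status.RunningApps, &tfv1.RunningAppDetail{
		Name:      workloadName,
		Namespace: namespace,
		Count:     1,
	})
}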
-appVersion: "1.12.1" +appVersion: "1.30.3" diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_gpunodes.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_gpunodes.yaml index 6cd2c886..c798cb3c 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_gpunodes.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_gpunodes.yaml @@ -86,66 +86,19 @@ spec: status: description: GPUNodeStatus defines the observed state of GPUNode. properties: - allocationDetails: - description: Allocation details is for node compaction, and calculate - used apps + allocationInfo: items: properties: - limits: - properties: - tflops: - anyOf: - - type: integer - - type: string - description: Tera floating point operations per second - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - vram: - anyOf: - - type: integer - - type: string - description: VRAM is short for Video memory, namely GPU - RAM - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: object - namespace: - type: string - podID: - type: string - podName: - type: string - qos: - enum: - - low - - medium - - high - - critical + count: + description: Worker count + type: integer + name: + description: Workload name namespace type: string - requests: - properties: - tflops: - anyOf: - - type: integer - - type: string - description: Tera floating point operations per second - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - vram: - anyOf: - - type: integer - - type: string - description: VRAM is short for Video memory, namely GPU - RAM - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: object - workload: + namespace: type: string required: - - limits - - namespace - - requests + - count type: object type: array availableTFlops: diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml index c2257300..781418d0 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml @@ -546,7 +546,8 @@ spec: description: Default requests and limitsOverRequests are same, indicates normal on-demand serverless GPU usage, in hands-on lab low QoS case, limitsOverRequests should - be cheaper, for example Low QoS, ratio should be 0.5 + be lower, so that user can get burstable GPU resources + with very low cost type: string qos: enum: @@ -704,6 +705,9 @@ spec: readyNodes: format: int32 type: integer + runningAppsCnt: + format: int32 + type: integer savedCostsPerMonth: type: string totalGPUs: diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml index ace87bc4..09a14f86 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml @@ -116,6 +116,21 @@ spec: - Destroying - Migrating type: string + runningApps: + items: + properties: + count: + description: Worker count + type: integer + name: + description: Workload name namespace + type: string + namespace: + type: string + required: + - count + type: object + type: array uuid: type: string required: diff --git 
a/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml index b7bc95c5..3cb00209 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml @@ -650,8 +650,8 @@ spec: description: Default requests and limitsOverRequests are same, indicates normal on-demand serverless GPU usage, in hands-on lab low QoS case, limitsOverRequests - should be cheaper, for example Low QoS, ratio - should be 0.5 + should be lower, so that user can get burstable + GPU resources with very low cost type: string qos: enum: diff --git a/charts/tensor-fusion/templates/admission-webhooks/mutating-webhook.yaml b/charts/tensor-fusion/templates/admission-webhooks/mutating-webhook.yaml index 5ea83d82..581e4ce2 100644 --- a/charts/tensor-fusion/templates/admission-webhooks/mutating-webhook.yaml +++ b/charts/tensor-fusion/templates/admission-webhooks/mutating-webhook.yaml @@ -22,6 +22,7 @@ webhooks: resources: - pods sideEffects: None + timeoutSeconds: 30 objectSelector: matchExpressions: - key: tensor-fusion.ai/enabled diff --git a/charts/tensor-fusion/templates/controller-deployment.yaml b/charts/tensor-fusion/templates/controller-deployment.yaml index 92a57194..14301a66 100644 --- a/charts/tensor-fusion/templates/controller-deployment.yaml +++ b/charts/tensor-fusion/templates/controller-deployment.yaml @@ -4,6 +4,7 @@ metadata: name: {{ include "tensor-fusion.fullname" . }}-controller namespace: {{ include "tensor-fusion.namespace" . }} labels: + tensor-fusion.ai/component: operator {{- include "tensor-fusion.labels" . | nindent 4 }} spec: replicas: {{ .Values.controller.replicaCount }} @@ -12,6 +13,7 @@ spec: app.kubernetes.io/name: {{ include "tensor-fusion.name" . }} app.kubernetes.io/instance: {{ .Release.Name }} app.kubernetes.io/component: controller + tensor-fusion.ai/component: operator template: metadata: {{- with .Values.controller.podAnnotations }} @@ -22,6 +24,7 @@ spec: app.kubernetes.io/name: {{ include "tensor-fusion.name" . }} app.kubernetes.io/instance: {{ .Release.Name }} app.kubernetes.io/component: controller + tensor-fusion.ai/component: operator spec: {{- with .Values.imagePullSecrets }} imagePullSecrets: @@ -35,6 +38,7 @@ spec: - /manager - -metrics-bind-address - :9000 + - -leader-elect livenessProbe: {{- toYaml .Values.controller.livenessProbe | nindent 12 }} readinessProbe: @@ -54,6 +58,8 @@ spec: - name: cert readOnly: true mountPath: /tmp/k8s-webhook-server/serving-certs + - name: logs + mountPath: /logs - name: cloud-vendor-credentials mountPath: /tmp/secret readOnly: true @@ -85,12 +91,17 @@ spec: requests: cpu: 50m memory: 64Mi + limits: + cpu: 1000m + memory: 512Mi volumeMounts: - name: logs mountPath: /logs - name: vector-config mountPath: /etc/vector/vector.yaml subPath: vector-operator.yaml + - name: kubernetes-logs + mountPath: /var/log/pods volumes: - name: cert secret: @@ -115,6 +126,9 @@ spec: name: gpu-info - name: logs emptyDir: {} + - name: kubernetes-logs + hostPath: + path: /var/log/pods {{- with .Values.controller.affinity }} affinity: {{- toYaml . 
| nindent 8 }} diff --git a/charts/tensor-fusion/templates/gpu-public-gpu-info.yaml b/charts/tensor-fusion/templates/gpu-public-gpu-info.yaml index f39c3798..23b2cb4a 100644 --- a/charts/tensor-fusion/templates/gpu-public-gpu-info.yaml +++ b/charts/tensor-fusion/templates/gpu-public-gpu-info.yaml @@ -432,6 +432,12 @@ data: costPerHour: 0.26 fp16TFlops: 163 + - model: RTX2000Ada + fullModelName: "NVIDIA RTX 2000 Ada Generation" + vendor: NVIDIA + costPerHour: 0.23 + fp16TFlops: 46 + # NVIDIA GTX Series - model: GTX1050Ti fullModelName: "NVIDIA GeForce GTX 1050 Ti" diff --git a/charts/tensor-fusion/templates/greptime-standalone.yaml b/charts/tensor-fusion/templates/greptime-standalone.yaml index 58810885..5f13c9dc 100644 --- a/charts/tensor-fusion/templates/greptime-standalone.yaml +++ b/charts/tensor-fusion/templates/greptime-standalone.yaml @@ -61,15 +61,18 @@ metadata: namespace: greptimedb labels: app.greptime.io/component: greptimedb-standalone + tensor-fusion.ai/component: greptimedb spec: replicas: 1 selector: matchLabels: app.greptime.io/component: greptimedb-standalone + tensor-fusion.ai/component: greptimedb template: metadata: labels: app.greptime.io/component: greptimedb-standalone + tensor-fusion.ai/component: greptimedb spec: volumes: - name: logs diff --git a/charts/tensor-fusion/templates/rbac-hypervisor.yaml b/charts/tensor-fusion/templates/rbac-hypervisor.yaml new file mode 100644 index 00000000..5ce23a05 --- /dev/null +++ b/charts/tensor-fusion/templates/rbac-hypervisor.yaml @@ -0,0 +1,14 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: tensor-fusion-hypervisor-role +rules: +- apiGroups: + - "" + resources: + - nodes + - pods + verbs: + - get + - list + - watch diff --git a/charts/tensor-fusion/templates/rbac.yaml b/charts/tensor-fusion/templates/rbac.yaml index 523bf956..f2ae925b 100644 --- a/charts/tensor-fusion/templates/rbac.yaml +++ b/charts/tensor-fusion/templates/rbac.yaml @@ -104,6 +104,18 @@ rules: - patch - update - watch +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - list + - watch + - create + - update + - patch + - delete - apiGroups: - tensor-fusion.ai resources: diff --git a/charts/tensor-fusion/templates/serviceaccount-hypervisor.yaml b/charts/tensor-fusion/templates/serviceaccount-hypervisor.yaml new file mode 100644 index 00000000..1c117826 --- /dev/null +++ b/charts/tensor-fusion/templates/serviceaccount-hypervisor.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + # Service account for watch vGPU worker auto scaling event and collect Pod log metadata + # The name is fixed and only needs pods/nodes read permission + name: tensor-fusion-hypervisor-sa + namespace: {{ include "tensor-fusion.namespace" . }} + labels: + {{- include "tensor-fusion.labels" . 
| nindent 4 }} +automountServiceAccountToken: true \ No newline at end of file diff --git a/charts/tensor-fusion/templates/vector-config.yaml b/charts/tensor-fusion/templates/vector-config.yaml index 2479a4a7..406de6eb 100644 --- a/charts/tensor-fusion/templates/vector-config.yaml +++ b/charts/tensor-fusion/templates/vector-config.yaml @@ -8,30 +8,80 @@ data: api: enabled: true sources: - controller_metrics: - type: prometheus_scrape - endpoints: - - http://localhost:9000/metrics + metrics: + type: file + data_dir: /logs + include: + - /logs/metrics*.log + + kubernetes_logs: + type: kubernetes_logs + self_node_name: "${NODE_NAME}" + extra_label_selector: "tensor-fusion.ai/component in (operator)" + transforms: - prepare_controller_metrics: + parse_influx: type: remap - inputs: - - controller_metrics + inputs: + - metrics + source: | + . = parse_influxdb!(.message) + prepare_metrics: + type: remap + inputs: + - parse_influx source: | .namespace = "tf" + .tags.nodeName = "${NODE_NAME}" + + log_to_metric: + type: log_to_metric + inputs: + - prepare_metrics + all_metrics: true + metrics: [] + + prepare_kubernetes_logs: + type: remap + inputs: + - kubernetes_logs + source: | + .message = .message + .container = .kubernetes.container_name + .pod = .kubernetes.pod_name + .namespace = .kubernetes.pod_namespace + .component = .kubernetes.pod_labels."tensor-fusion.ai/component" + del(.kubernetes) + del(.file) + del(.source_type) sinks: - sink_greptimedb_controller_metrics: - type: prometheus_remote_write + sink_greptimedb_operator_metrics: + type: greptimedb_metrics inputs: - - prepare_controller_metrics + - log_to_metric + new_naming: false + endpoint: {{ .Values.greptime.host }}:{{ .Values.greptime.port }} + {{- if eq .Values.greptime.isCloud true }} + dbname: {{ .Values.greptime.db }} + username: {{ .Values.greptime.user }} + password: {{ .Values.greptime.password }} + tls: {} + {{- end }} + + sink_greptimedb_operator_logs: + type: greptimedb_logs + compression: gzip + table: tf_system_log + inputs: + - prepare_kubernetes_logs {{- if ne .Values.greptime.isCloud true }} - endpoint: http://{{ .Values.greptime.host }}:4000/v1/prometheus/write?db=public + endpoint: http://{{ .Values.greptime.host }}:4000 + dbname: public {{- else }} - endpoint: https://{{ .Values.greptime.host }}/v1/prometheus/write?db={{ .Values.greptime.db }} - auth: - strategy: basic - user: {{ .Values.greptime.user }} - password: {{ .Values.greptime.password }} + endpoint: https://{{ .Values.greptime.host }} + dbname: {{ .Values.greptime.db }} + username: {{ .Values.greptime.user }} + password: {{ .Values.greptime.password }} {{- end }} vector-hypervisor.yaml: | @@ -39,12 +89,15 @@ data: enabled: true sources: + kubernetes_logs: + type: kubernetes_logs + self_node_name: "${NODE_NAME}" + extra_label_selector: "tensor-fusion.ai/component in (hypervisor,worker)" metrics: type: file data_dir: /logs include: - - /logs/metrics.log - + - /logs/metrics.log.* transforms: parse_influx: type: remap @@ -67,6 +120,20 @@ data: all_metrics: true metrics: [] + prepare_kubernetes_logs: + type: remap + inputs: + - kubernetes_logs + source: | + .message = .message + .container = .kubernetes.container_name + .pod = .kubernetes.pod_name + .namespace = .kubernetes.pod_namespace + .component = .kubernetes.pod_labels."tensor-fusion.ai/component" + del(.kubernetes) + del(.file) + del(.source_type) + sinks: sink_greptimedb_hypervisor_metrics: type: greptimedb_metrics @@ -80,4 +147,18 @@ data: password: {{ .Values.greptime.password }} tls: {} {{- end }} - 
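// Both vector configs above now tail the metrics files under /logs with a `file` source and parse
// each record via parse_influxdb!, so the processes writing those files (operator and hypervisor)
// are expected to emit InfluxDB line protocol. A minimal sketch of such a writer, assuming the two
// dependencies this change adds to go.mod (line-protocol/v2 for encoding, lumberjack for rotation);
// the measurement, tag and field names are illustrative only.
package main

import (
	"time"

	"github.com/influxdata/line-protocol/v2/lineprotocol"
	"gopkg.in/natefinch/lumberjack.v2"
)

func main() {
	// Rotating writer backing the file picked up by the vector "metrics" source.
	out := &lumberjack.Logger{Filename: "/logs/metrics.log", MaxSize: 100, MaxBackups: 3}
	defer out.Close()

	var enc lineprotocol.Encoder
	enc.SetPrecision(lineprotocol.Millisecond)
	enc.StartLine("tf_gpu_usage") // illustrative measurement name
	enc.AddTag("nodeName", "node-a")
	enc.AddField("tflops", lineprotocol.MustNewValue(12.5))
	enc.EndLine(time.Now())
	if err := enc.Err(); err != nil {
		panic(err)
	}
	_, _ = out.Write(append(enc.Bytes(), '\n')) // keep records newline-terminated for the tailer
}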
+ sink_greptimedb_hypervisor_worker_logs: + type: greptimedb_logs + compression: gzip + table: tf_system_log + inputs: + - prepare_kubernetes_logs + {{- if ne .Values.greptime.isCloud true }} + endpoint: http://{{ .Values.greptime.host }}:4000 + dbname: public + {{- else }} + endpoint: https://{{ .Values.greptime.host }} + dbname: {{ .Values.greptime.db }} + username: {{ .Values.greptime.user }} + password: {{ .Values.greptime.password }} + {{- end }} diff --git a/charts/tensor-fusion/values.schema.json b/charts/tensor-fusion/values.schema.json index a3c112ed..6ff52262 100644 --- a/charts/tensor-fusion/values.schema.json +++ b/charts/tensor-fusion/values.schema.json @@ -288,7 +288,7 @@ "cloudEndpoint": { "type": "string", "description": "WebSocket endpoint for cloud communication", - "default": "wss://app.tensor-fusion.ai/_ws" + "default": "wss://app.tensor-fusion.ai" }, "image": { "type": "object", diff --git a/charts/tensor-fusion/values.yaml b/charts/tensor-fusion/values.yaml index 482e6f4f..4798299e 100644 --- a/charts/tensor-fusion/values.yaml +++ b/charts/tensor-fusion/values.yaml @@ -37,7 +37,18 @@ controller: podAnnotations: {} tolerations: [] - affinity: {} + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: tensor-fusion.ai/component + operator: In + values: + - operator + topologyKey: "kubernetes.io/hostname" livenessProbe: httpGet: path: /healthz @@ -94,7 +105,7 @@ greptime: agent: enrollToken: "token-from-cloud" agentId: 'org-from-cloud:env' - cloudEndpoint: "wss://app.tensor-fusion.ai/_ws" + cloudEndpoint: "wss://app.tensor-fusion.ai" image: repository: tensorfusion/tensor-fusion-agent diff --git a/cmd/main.go b/cmd/main.go index a12ae4f8..22ad1631 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -45,6 +45,8 @@ import ( "github.com/NexusGPU/tensor-fusion/internal/config" "github.com/NexusGPU/tensor-fusion/internal/controller" "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" + "github.com/NexusGPU/tensor-fusion/internal/metrics" + "github.com/NexusGPU/tensor-fusion/internal/portallocator" "github.com/NexusGPU/tensor-fusion/internal/server" "github.com/NexusGPU/tensor-fusion/internal/server/router" "github.com/NexusGPU/tensor-fusion/internal/version" @@ -57,6 +59,8 @@ var ( setupLog = ctrl.Log.WithName("setup") ) +const LeaderElectionID = "85104305.tensor-fusion.ai" + func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) @@ -64,6 +68,7 @@ func init() { // +kubebuilder:scaffold:scheme } +//nolint:gocyclo func main() { var metricsAddr string var enableLeaderElection bool @@ -72,6 +77,9 @@ func main() { var enableHTTP2 bool var tlsOpts []func(*tls.Config) var gpuInfoConfig string + var metricsPath string + var nodeLevelPortRange string + var clusterLevelPortRange string flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. 
"+ "Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.") @@ -85,6 +93,12 @@ func main() { "If set, HTTP/2 will be enabled for the metrics and webhook servers") flag.StringVar(&gpuInfoConfig, "gpu-info-config", "/etc/tensor-fusion/gpu-info.yaml", "specify the path to gpuInfoConfig file") + flag.StringVar(&metricsPath, "metrics-path", "/logs/metrics.log", "specify the path to metrics file") + flag.StringVar(&nodeLevelPortRange, "host-port-range", "40000-42000", + "specify the port range for assigning ports to pre-scheduled Pods such as vGPU workers") + flag.StringVar(&clusterLevelPortRange, "cluster-host-port-range", "42000-62000", + "specify the port range for assigning ports to random Pods"+ + " marked with `tensor-fusion.ai/host-port: auto` and `tensor-fusion.ai/port-name: ssh`") opts := zap.Options{ Development: true, } @@ -120,9 +134,13 @@ func main() { ctrl.Log.Error(err, "unable to read gpuInfoConfig file") gpuInfos = make([]config.GpuInfo, 0) } + gpuPricingMap := make(map[string]float64) + for _, gpuInfo := range gpuInfos { + gpuPricingMap[gpuInfo.FullModelName] = gpuInfo.CostPerHour + } // Watch configMap change with interval, check lastModifiedTime to reload gpuInfoConfig - watchGPUInfoChanges(gpuInfoConfig, &gpuInfos) + watchGPUInfoChanges(gpuInfoConfig, &gpuInfos, gpuPricingMap) // Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server. // More info: @@ -154,7 +172,7 @@ func main() { WebhookServer: webhookServer, HealthProbeBindAddress: probeAddr, LeaderElection: enableLeaderElection, - LeaderElectionID: "85104305.tensor-fusion.ai", + LeaderElectionID: LeaderElectionID, // LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily // when the Manager ends. This requires the binary to immediately end when the // Manager is stopped, otherwise, this setting is unsafe. 
Setting this significantly @@ -174,12 +192,32 @@ func main() { ctx := context.Background() + metricsRecorder := metrics.MetricsRecorder{ + MetricsOutputPath: metricsPath, + HourlyUnitPriceMap: gpuPricingMap, + + // Worker level map will be updated by cluster reconcile + // Key is poolName, second level key is QoS level + WorkerUnitPriceMap: make(map[string]map[string]metrics.RawBillingPricing), + } + + startMetricsRecorder(enableLeaderElection, mgr, metricsRecorder) + // Initialize GPU allocator and set up watches allocator := gpuallocator.NewGpuAllocator(ctx, mgr.GetClient(), 10*time.Second) if _, err = allocator.SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "unable to set up GPU allocator watches") os.Exit(1) } + + // Initialize Port allocator and set up watches + portAllocator, err := portallocator.NewPortAllocator(ctx, mgr.GetClient(), nodeLevelPortRange, clusterLevelPortRange) + if err != nil { + setupLog.Error(err, "unable to set up port allocator") + os.Exit(1) + } + _ = portAllocator.SetupWithManager(ctx, mgr) + if err = (&controller.TensorFusionConnectionReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), @@ -199,16 +237,17 @@ func main() { // nolint:goconst if os.Getenv("ENABLE_WEBHOOKS") != "false" { - if err = webhookcorev1.SetupPodWebhookWithManager(mgr); err != nil { + if err = webhookcorev1.SetupPodWebhookWithManager(mgr, portAllocator); err != nil { setupLog.Error(err, "unable to create webhook", "webhook", "Pod") os.Exit(1) } } if err = (&controller.TensorFusionClusterReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Recorder: mgr.GetEventRecorderFor("TensorFusionCluster"), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("TensorFusionCluster"), + MetricsRecorder: &metricsRecorder, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "TensorFusionCluster") os.Exit(1) @@ -255,8 +294,9 @@ func main() { os.Exit(1) } if err = (&controller.PodReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + PortAllocator: portAllocator, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Pod") os.Exit(1) @@ -278,11 +318,12 @@ func main() { os.Exit(1) } if err = (&controller.TensorFusionWorkloadReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Allocator: allocator, - Recorder: mgr.GetEventRecorderFor("tensorfusionworkload"), - GpuInfos: &gpuInfos, + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Allocator: allocator, + Recorder: mgr.GetEventRecorderFor("tensorfusionworkload"), + GpuInfos: &gpuInfos, + PortAllocator: portAllocator, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "TensorFusionWorkload") os.Exit(1) @@ -308,7 +349,12 @@ func main() { setupLog.Error(err, "failed to create connection router") os.Exit(1) } - httpServer := server.NewHTTPServer(connectionRouter) + assignHostPortRouter, err := router.NewAssignHostPortRouter(ctx, portAllocator) + if err != nil { + setupLog.Error(err, "failed to create assign host port router") + os.Exit(1) + } + httpServer := server.NewHTTPServer(connectionRouter, assignHostPortRouter) go func() { err := httpServer.Run() if err != nil { @@ -339,7 +385,18 @@ func main() { } } -func watchGPUInfoChanges(gpuInfoConfig string, gpuInfos *[]config.GpuInfo) { +func startMetricsRecorder(enableLeaderElection bool, mgr 
manager.Manager, metricsRecorder metrics.MetricsRecorder) { + if enableLeaderElection { + go func() { + <-mgr.Elected() + metricsRecorder.Start() + }() + } else { + go metricsRecorder.Start() + } +} + +func watchGPUInfoChanges(gpuInfoConfig string, gpuInfos *[]config.GpuInfo, gpuPricingMap map[string]float64) { var lastModTime time.Time if fileInfo, err := os.Stat(gpuInfoConfig); err == nil { lastModTime = fileInfo.ModTime() @@ -367,6 +424,9 @@ func watchGPUInfoChanges(gpuInfoConfig string, gpuInfos *[]config.GpuInfo) { } *gpuInfos = updatedGpuInfos + for _, gpuInfo := range updatedGpuInfos { + gpuPricingMap[gpuInfo.FullModelName] = gpuInfo.CostPerHour + } lastModTime = currentModTime ctrl.Log.Info("gpuInfo reloaded successfully.", "gpuInfoConfig", gpuInfoConfig) } diff --git a/cmd/nodediscovery/main.go b/cmd/nodediscovery/main.go index a3accb91..3641f01f 100644 --- a/cmd/nodediscovery/main.go +++ b/cmd/nodediscovery/main.go @@ -22,6 +22,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" "k8s.io/client-go/util/retry" @@ -200,7 +201,12 @@ func createOrUpdateTensorFusionGPU( }, } - err := retry.OnError(retry.DefaultBackoff, func(err error) bool { + err := retry.OnError(wait.Backoff{ + Steps: 10, + Duration: time.Second, + Factor: 1.0, + Jitter: 0.1, + }, func(err error) bool { return true // Retry on all errors for now }, func() error { _, err := controllerutil.CreateOrUpdate(ctx, k8sClient, gpu, func() error { @@ -253,6 +259,7 @@ func createOrUpdateTensorFusionGPU( NodeSelector: map[string]string{ "kubernetes.io/hostname": k8sNodeName, }, + RunningApps: []*tfv1.RunningAppDetail{}, } if gpu.Status.Available == nil { diff --git a/config/crd/bases/tensor-fusion.ai_gpunodes.yaml b/config/crd/bases/tensor-fusion.ai_gpunodes.yaml index 6cd2c886..c798cb3c 100644 --- a/config/crd/bases/tensor-fusion.ai_gpunodes.yaml +++ b/config/crd/bases/tensor-fusion.ai_gpunodes.yaml @@ -86,66 +86,19 @@ spec: status: description: GPUNodeStatus defines the observed state of GPUNode. 
properties: - allocationDetails: - description: Allocation details is for node compaction, and calculate - used apps + allocationInfo: items: properties: - limits: - properties: - tflops: - anyOf: - - type: integer - - type: string - description: Tera floating point operations per second - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - vram: - anyOf: - - type: integer - - type: string - description: VRAM is short for Video memory, namely GPU - RAM - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: object - namespace: - type: string - podID: - type: string - podName: - type: string - qos: - enum: - - low - - medium - - high - - critical + count: + description: Worker count + type: integer + name: + description: Workload name namespace type: string - requests: - properties: - tflops: - anyOf: - - type: integer - - type: string - description: Tera floating point operations per second - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - vram: - anyOf: - - type: integer - - type: string - description: VRAM is short for Video memory, namely GPU - RAM - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: object - workload: + namespace: type: string required: - - limits - - namespace - - requests + - count type: object type: array availableTFlops: diff --git a/config/crd/bases/tensor-fusion.ai_gpupools.yaml b/config/crd/bases/tensor-fusion.ai_gpupools.yaml index c2257300..781418d0 100644 --- a/config/crd/bases/tensor-fusion.ai_gpupools.yaml +++ b/config/crd/bases/tensor-fusion.ai_gpupools.yaml @@ -546,7 +546,8 @@ spec: description: Default requests and limitsOverRequests are same, indicates normal on-demand serverless GPU usage, in hands-on lab low QoS case, limitsOverRequests should - be cheaper, for example Low QoS, ratio should be 0.5 + be lower, so that user can get burstable GPU resources + with very low cost type: string qos: enum: @@ -704,6 +705,9 @@ spec: readyNodes: format: int32 type: integer + runningAppsCnt: + format: int32 + type: integer savedCostsPerMonth: type: string totalGPUs: diff --git a/config/crd/bases/tensor-fusion.ai_gpus.yaml b/config/crd/bases/tensor-fusion.ai_gpus.yaml index ace87bc4..09a14f86 100644 --- a/config/crd/bases/tensor-fusion.ai_gpus.yaml +++ b/config/crd/bases/tensor-fusion.ai_gpus.yaml @@ -116,6 +116,21 @@ spec: - Destroying - Migrating type: string + runningApps: + items: + properties: + count: + description: Worker count + type: integer + name: + description: Workload name namespace + type: string + namespace: + type: string + required: + - count + type: object + type: array uuid: type: string required: diff --git a/config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml b/config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml index b7bc95c5..3cb00209 100644 --- a/config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml +++ b/config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml @@ -650,8 +650,8 @@ spec: description: Default requests and limitsOverRequests are same, indicates normal on-demand serverless GPU usage, in hands-on lab low QoS case, limitsOverRequests - should be cheaper, for example 
Low QoS, ratio - should be 0.5 + should be lower, so that user can get burstable + GPU resources with very low cost type: string qos: enum: diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 82095a0f..ea3cf175 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -116,6 +116,18 @@ rules: - patch - update - watch +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - tensor-fusion.ai resources: diff --git a/config/webhook/manifests.yaml b/config/webhook/manifests.yaml index 5db5cdf3..1975a164 100644 --- a/config/webhook/manifests.yaml +++ b/config/webhook/manifests.yaml @@ -23,6 +23,7 @@ webhooks: resources: - pods sideEffects: None + timeoutSeconds: 30 objectSelector: matchExpressions: - key: tensor-fusion.ai/enabled diff --git a/go.mod b/go.mod index 175bdb9e..4e63759a 100644 --- a/go.mod +++ b/go.mod @@ -11,20 +11,20 @@ require ( github.com/aws/smithy-go v1.22.3 github.com/gin-contrib/gzip v1.2.3 github.com/gin-gonic/gin v1.10.1 + github.com/influxdata/line-protocol/v2 v2.2.1 github.com/lithammer/shortuuid/v4 v4.2.0 github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.37.0 - github.com/prometheus/client_golang v1.22.0 github.com/samber/lo v1.50.0 github.com/shirou/gopsutil v3.21.11+incompatible github.com/stretchr/testify v1.10.0 - golang.org/x/exp v0.0.0-20241217172543-b2144cdd0a67 gomodules.xyz/jsonpatch/v2 v2.5.0 + gopkg.in/natefinch/lumberjack.v2 v2.2.1 k8s.io/api v0.33.1 k8s.io/apimachinery v0.33.1 k8s.io/client-go v0.33.1 k8s.io/component-helpers v0.33.1 - k8s.io/utils v0.0.0-20241210054802-24370beab758 + k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 sigs.k8s.io/controller-runtime v0.21.0 sigs.k8s.io/yaml v1.4.0 ) @@ -86,6 +86,7 @@ require ( github.com/pelletier/go-toml/v2 v2.2.3 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/prometheus/client_golang v1.22.0 // indirect github.com/prometheus/client_model v0.6.1 // indirect github.com/prometheus/common v0.62.0 // indirect github.com/prometheus/procfs v0.15.1 // indirect @@ -109,15 +110,16 @@ require ( go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 // indirect golang.org/x/arch v0.15.0 // indirect - golang.org/x/crypto v0.36.0 // indirect - golang.org/x/net v0.38.0 // indirect + golang.org/x/crypto v0.38.0 // indirect + golang.org/x/exp v0.0.0-20250506013437-ce4c2cf36ca6 // indirect + golang.org/x/net v0.40.0 // indirect golang.org/x/oauth2 v0.27.0 // indirect - golang.org/x/sync v0.12.0 // indirect - golang.org/x/sys v0.32.0 // indirect - golang.org/x/term v0.30.0 // indirect - golang.org/x/text v0.23.0 // indirect + golang.org/x/sync v0.14.0 // indirect + golang.org/x/sys v0.33.0 // indirect + golang.org/x/term v0.32.0 // indirect + golang.org/x/text v0.25.0 // indirect golang.org/x/time v0.9.0 // indirect - golang.org/x/tools v0.31.0 // indirect + golang.org/x/tools v0.33.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20241223144023-3abc09e42ca8 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20241223144023-3abc09e42ca8 // indirect google.golang.org/grpc v1.69.2 // indirect diff --git a/go.sum b/go.sum index d556026c..1248b05f 100644 --- a/go.sum +++ b/go.sum @@ -57,6 +57,10 @@ github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/X github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= 
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= +github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= +github.com/frankban/quicktest v1.11.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= +github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk= +github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU= github.com/fsnotify/fsnotify v1.8.0 h1:dAwr6QBTBZIkG8roQaJjGof0pp0EeF+tNV7YBP3F/8M= github.com/fsnotify/fsnotify v1.8.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= @@ -109,7 +113,9 @@ github.com/google/cel-go v0.23.2 h1:UdEe3CvQh3Nv+E/j9r1Y//WO0K0cSyD7/y0bzyLIMI4= github.com/google/cel-go v0.23.2/go.mod h1:52Pb6QsDbC5kvgxvZhiL9QX1oZEkcUF/ZqaPx1J5Wwo= github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= +github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= @@ -126,6 +132,13 @@ github.com/grpc-ecosystem/grpc-gateway/v2 v2.25.1 h1:VNqngBF40hVlDloBruUehVYC3Ar github.com/grpc-ecosystem/grpc-gateway/v2 v2.25.1/go.mod h1:RBRO7fro65R6tjKzYgLAFo0t1QEXY1Dp+i/bvpRiqiQ= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/influxdata/line-protocol-corpus v0.0.0-20210519164801-ca6fa5da0184/go.mod h1:03nmhxzZ7Xk2pdG+lmMd7mHDfeVOYFyhOgwO61qWU98= +github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937 h1:MHJNQ+p99hFATQm6ORoLmpUCF7ovjwEFshs/NHzAbig= +github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937/go.mod h1:BKR9c0uHSmRgM/se9JhFHtTT7JTO67X23MtKMHtZcpo= +github.com/influxdata/line-protocol/v2 v2.0.0-20210312151457-c52fdecb625a/go.mod h1:6+9Xt5Sq1rWx+glMgxhcg2c0DUaehK+5TDcPZ76GypY= +github.com/influxdata/line-protocol/v2 v2.1.0/go.mod h1:QKw43hdUBg3GTk2iC3iyCxksNj7PX9aUSeYOYE/ceHY= +github.com/influxdata/line-protocol/v2 v2.2.1 h1:EAPkqJ9Km4uAxtMRgUubJyqAr6zgWM0dznKMLRauQRE= +github.com/influxdata/line-protocol/v2 v2.2.1/go.mod h1:DmB3Cnh+3oxmG6LOBIxce4oaL4CPj3OmMPgvauXh+tM= github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= @@ -143,6 +156,7 @@ github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa02 github.com/klauspost/cpuid/v2 v2.2.10 h1:tBs3QSyvjDyFTq3uoc/9xFpCuOsJQFNPiAhYdw2skhE= github.com/klauspost/cpuid/v2 v2.2.10/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= github.com/knz/go-libedit v1.10.1/go.mod 
h1:MZTVkCWyz0oBc7JOWP3wNAzd002ZbM/5hgShxwh4x8M= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= @@ -266,15 +280,15 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34= -golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc= +golang.org/x/crypto v0.38.0 h1:jt+WWG8IZlBnVbomuhg2Mdq0+BBQaHbtqHEFEigjUV8= +golang.org/x/crypto v0.38.0/go.mod h1:MvrbAqul58NNYPKnOra203SB9vpuZW0e+RRZV+Ggqjw= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY= -golang.org/x/exp v0.0.0-20241217172543-b2144cdd0a67 h1:1UoZQm6f0P/ZO0w1Ri+f+ifG/gXhegadRdwBIXEFWDo= -golang.org/x/exp v0.0.0-20241217172543-b2144cdd0a67/go.mod h1:qj5a5QZpwLU2NLQudwIN5koi3beDhSAlJwa67PuM98c= +golang.org/x/exp v0.0.0-20250506013437-ce4c2cf36ca6 h1:y5zboxd6LQAqYIhHnB48p0ByQ/GnQx2BE33L8BOHQkI= +golang.org/x/exp v0.0.0-20250506013437-ce4c2cf36ca6/go.mod h1:U6Lno4MTRCDY+Ba7aCcauB9T60gsv5s4ralQzP72ZoQ= golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= @@ -286,29 +300,29 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= -golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= +golang.org/x/net v0.40.0 h1:79Xs7wF06Gbdcg4kdCCIQArK11Z1hr5POQ6+fIYHNuY= +golang.org/x/net v0.40.0/go.mod h1:y0hY0exeL2Pku80/zKK7tpntoX23cqL3Oa6njdgRtds= golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M= golang.org/x/oauth2 v0.27.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync 
v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw= -golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.14.0 h1:woo0S4Yywslg6hp4eUFjTVOyKt0RookbpAHG4c1HmhQ= +golang.org/x/sync v0.14.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= -golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y= -golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g= +golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= +golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/term v0.32.0 h1:DR4lr0TjUs3epypdhTOkMmuF5CDFJ/8pOnbzMZPQ7bg= +golang.org/x/term v0.32.0/go.mod h1:uZG1FhGx848Sqfsq4/DlJr3xGGsYMu/L5GW4abiaEPQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= -golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= +golang.org/x/text v0.25.0 h1:qVyWApTSYLk/drJRO5mDlNYskwQznZmkpV2c8q9zls4= +golang.org/x/text v0.25.0/go.mod h1:WEdwpYrmk1qmdHvhkSTNPm3app7v4rsT8F2UD6+VHIA= golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -318,8 +332,8 @@ golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtn golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.31.0 h1:0EedkvKDbh+qistFTd0Bcwe/YLh4vHwWEkiI0toFIBU= -golang.org/x/tools v0.31.0/go.mod h1:naFTU+Cev749tSJRXJlna0T3WxKvb1kWEx15xA4SdmQ= +golang.org/x/tools v0.33.0 h1:4qz2S3zmRxbGIhDIAgjxvFutSvH5EfnsYrRBj0UI0bc= +golang.org/x/tools v0.33.0/go.mod h1:CIJMaWEY88juyUfo7UbgPqbC8rU2OqfAV1h2Qp0oMYI= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -348,10 +362,13 @@ gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= 
gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= +gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc= +gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.0/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= @@ -373,8 +390,8 @@ k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff h1:/usPimJzUKKu+m+TE36gUyGcf03XZEP0ZIKgKj35LS4= k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8= -k8s.io/utils v0.0.0-20241210054802-24370beab758 h1:sdbE21q2nlQtFh65saZY+rRM6x6aJJI8IUa1AmH/qa0= -k8s.io/utils v0.0.0-20241210054802-24370beab758/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 h1:jgJW5IePPXLGB8e/1wvd0Ich9QE97RvvF3a8J3fP/Lg= +k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= nullprogram.com/x/optparse v1.0.0/go.mod h1:KdyPE+Igbe0jQUrVfMqDMeJQIJZEuyV7pjYmp6pbG50= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM= diff --git a/internal/alert/evaluation.go b/internal/alert/evaluation.go new file mode 100644 index 00000000..6b05069f --- /dev/null +++ b/internal/alert/evaluation.go @@ -0,0 +1,4 @@ +package alert + +// connect TSDB, eval every minute of all rules to generate alerts in mem, and check existing alert resolved or not +// send alerts API to alertmanager, let alertmanager to do deduplication, notification stuff diff --git a/internal/alert/rules.go b/internal/alert/rules.go new file mode 100644 index 00000000..b5d89882 --- /dev/null +++ b/internal/alert/rules.go @@ -0,0 +1,4 @@ +package alert + +// offer API for managing user configured alert rules, stored in configMap +// offer mem synced rules for evaluation routine to use diff --git a/internal/alert/setup.go b/internal/alert/setup.go new file mode 100644 index 00000000..6f33be57 --- /dev/null +++ b/internal/alert/setup.go @@ -0,0 +1,9 @@ +package alert + +// offer API to install/update prometheus alertmanager with configMap and values from configuration with a single statefulSet +// let user to manage and upgrade alertmanager by themselves +// wrap notification configurations and change config map then trigger reload like prometheus operator does (if not install by tensor-fusion, let user to use AlertManagerConfig by themselves, tensor-fusion will only trigger alert to pre-configured alertmanager endpoint) + +// use config map to 
manager alertmanager config + +// TODO: diff --git a/internal/cloudprovider/common/utils.go b/internal/cloudprovider/common/utils.go index ec1345f7..6b29a058 100644 --- a/internal/cloudprovider/common/utils.go +++ b/internal/cloudprovider/common/utils.go @@ -8,9 +8,10 @@ import ( "strings" "time" + "math/rand" + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/cloudprovider/types" - "golang.org/x/exp/rand" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "sigs.k8s.io/controller-runtime/pkg/log" @@ -205,7 +206,8 @@ func contains(slice []string, item string) bool { func generateRandomString(length int) string { const charset = "abcdefghijklmnopqrstuvwxyz" - rand.Seed(uint64(time.Now().UnixNano())) + source := rand.NewSource(time.Now().UnixNano()) + rand := rand.New(source) result := make([]byte, length) for i := range result { diff --git a/internal/config/gpupool_mock.go b/internal/config/gpupool_mock.go index 63ca4ff9..40b2f3e9 100644 --- a/internal/config/gpupool_mock.go +++ b/internal/config/gpupool_mock.go @@ -4,6 +4,7 @@ import ( "encoding/json" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/samber/lo" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" @@ -127,4 +128,33 @@ var MockGPUPoolSpec = &tfv1.GPUPoolSpec{ }, }, }, + QosConfig: &tfv1.QosConfig{ + Definitions: []tfv1.QosDefinition{ + { + Name: constants.QoSLevelMedium, + }, + { + Name: constants.QoSLevelHigh, + }, + }, + DefaultQoS: constants.QoSLevelMedium, + Pricing: []tfv1.QosPricing{ + { + Qos: constants.QoSLevelMedium, + Requests: tfv1.GPUResourcePricingUnit{ + PerFP16TFlopsPerHour: "2", + PerGBOfVRAMPerHour: "1", + }, + LimitsOverRequestsChargingRatio: "0.5", + }, + { + Qos: constants.QoSLevelHigh, + Requests: tfv1.GPUResourcePricingUnit{ + PerFP16TFlopsPerHour: "2", + PerGBOfVRAMPerHour: "1", + }, + LimitsOverRequestsChargingRatio: "0.8", + }, + }, + }, } diff --git a/internal/constants/constants.go b/internal/constants/constants.go index ed60ae7f..94b3f83e 100644 --- a/internal/constants/constants.go +++ b/internal/constants/constants.go @@ -18,8 +18,15 @@ const ( LabelKeyClusterOwner = Domain + "/cluster" LabelKeyNodeClass = Domain + "/node-class" LabelKeyPodTemplateHash = Domain + "/pod-template-hash" + LabelComponent = Domain + "/component" TrueStringValue = "true" + ComponentClient = "client" + ComponentWorker = "worker" + ComponentHypervisor = "hypervisor" + ComponentNodeDiscovery = "node-discovery" + ComponentOperator = "operator" + GPUNodePoolIdentifierLabelPrefix = Domain + "/pool-" GPUNodePoolIdentifierLabelFormat = Domain + "/pool-%s" NodeDeletionMark = Domain + "/should-delete" @@ -45,6 +52,11 @@ const ( IsLocalGPUAnnotation = Domain + "/is-local-gpu" NoStandaloneWorkerModeAnnotation = Domain + "/no-standalone-worker-mode" + GenHostPortLabel = Domain + "/host-port" + GenHostPortLabelValue = "auto" + GenHostPortNameLabel = Domain + "/port-name" + GenPortNumberAnnotation = Domain + "/port-number" + AutoScaleLimitsAnnotation = Domain + "/auto-limits" AutoScaleRequestsAnnotation = Domain + "/auto-requests" AutoScaleReplicasAnnotation = Domain + "/auto-replicas" @@ -72,6 +84,15 @@ const ( WorkerPodNameEnv = "POD_NAME" NamespaceEnv = "OPERATOR_NAMESPACE" NamespaceDefaultVal = "tensor-fusion-sys" + + KubernetesHostNameLabel = "kubernetes.io/hostname" + GiBToBytes = 1024 * 1024 * 1024 + HypervisorServiceAccountName = "tensor-fusion-hypervisor-sa" + + QoSLevelLow = "low" + 
QoSLevelMedium = "medium" + QoSLevelHigh = "high" + QoSLevelCritical = "critical" ) const ( @@ -123,3 +144,8 @@ const ( const TFDataPath = "/tmp/tensor-fusion/data" const DataVolumeName = "tf-data" const TensorFusionPoolManualCompaction = Domain + "/manual-compaction" + +const ( + LeaderInfoConfigMapName = "tensor-fusion-operator-leader-info" + LeaderInfoConfigMapLeaderIPKey = "leader-ip" +) diff --git a/internal/controller/gpu_controller.go b/internal/controller/gpu_controller.go index 1c11df63..79961272 100644 --- a/internal/controller/gpu_controller.go +++ b/internal/controller/gpu_controller.go @@ -88,6 +88,11 @@ func (r *GPUReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R return ctrl.Result{}, fmt.Errorf("node %s is not assigned to any pool", gpunode.Name) } + // No need to calculate patch since GPU's owner pool not changed + if gpu.Labels != nil && gpu.Labels[constants.GpuPoolKey] == poolName { + return ctrl.Result{}, nil + } + patch := client.MergeFrom(gpu.DeepCopy()) if gpu.Labels == nil { gpu.Labels = make(map[string]string) diff --git a/internal/controller/gpunode_controller.go b/internal/controller/gpunode_controller.go index ea28e9e6..b858b95c 100644 --- a/internal/controller/gpunode_controller.go +++ b/internal/controller/gpunode_controller.go @@ -26,13 +26,12 @@ import ( cloudprovider "github.com/NexusGPU/tensor-fusion/internal/cloudprovider" "github.com/NexusGPU/tensor-fusion/internal/cloudprovider/types" "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" "github.com/NexusGPU/tensor-fusion/internal/metrics" "github.com/NexusGPU/tensor-fusion/internal/utils" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" @@ -54,6 +53,7 @@ type GPUNodeReconciler struct { // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes/status,verbs=get;update;patch // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes/finalizers,verbs=update +// +kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;list;watch;create;update;patch;delete // Reconcile GPU nodes func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { @@ -80,6 +80,9 @@ func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct } } + // remove from metrics map + metrics.RemoveNodeMetrics(node.Name) + switch node.Spec.ManageMode { case tfv1.GPUNodeManageModeAutoSelect: // Do nothing, but if it's managed by Karpenter, should come up with some way to tell Karpenter to terminate the GPU node @@ -215,45 +218,13 @@ func (r *GPUNodeReconciler) checkStatusAndUpdateVirtualCapacity(ctx context.Cont return true, nil } else { - gpuList, err := r.fetchAllOwnedGPUDevices(ctx, node) + gpuModels, err := gpuallocator.RefreshGPUNodeCapacity(ctx, r.Client, node, poolObj) if err != nil { return true, err } - if len(gpuList) == 0 { - // node discovery job not completed, check again - return true, nil - } - - statusCopy := node.Status.DeepCopy() - - node.Status.AvailableVRAM = resource.Quantity{} - node.Status.AvailableTFlops = resource.Quantity{} - node.Status.TotalTFlops = resource.Quantity{} - node.Status.TotalVRAM = 
resource.Quantity{} - - for _, gpu := range gpuList { - node.Status.AvailableVRAM.Add(gpu.Status.Available.Vram) - node.Status.AvailableTFlops.Add(gpu.Status.Available.Tflops) - node.Status.TotalVRAM.Add(gpu.Status.Capacity.Vram) - node.Status.TotalTFlops.Add(gpu.Status.Capacity.Tflops) - } // update metrics to get historical allocation line chart and trending - metrics.AllocatedTflopsPercent.WithLabelValues(node.Status.KubernetesNodeName, poolObj.Name).Set((node.Status.TotalTFlops.AsApproximateFloat64() - node.Status.AvailableTFlops.AsApproximateFloat64()) / node.Status.TotalTFlops.AsApproximateFloat64()) - metrics.AllocatedVramBytes.WithLabelValues(node.Status.KubernetesNodeName, poolObj.Name).Set(node.Status.TotalVRAM.AsApproximateFloat64() - node.Status.AvailableVRAM.AsApproximateFloat64()) - - virtualVRAM, virtualTFlops := r.CalculateVirtualCapacity(node, poolObj) - node.Status.VirtualTFlops = virtualTFlops - node.Status.VirtualVRAM = virtualVRAM - - node.Status.Phase = tfv1.TensorFusionGPUNodePhaseRunning - - if !equality.Semantic.DeepEqual(node.Status, statusCopy) { - err = r.Status().Update(ctx, node) - if err != nil { - return true, fmt.Errorf("failed to update GPU node status: %w", err) - } - } + metrics.SetNodeMetrics(node, poolObj, gpuModels) err = r.syncStatusToGPUDevices(ctx, node, tfv1.TensorFusionGPUPhaseRunning) if err != nil { @@ -305,21 +276,24 @@ func (r *GPUNodeReconciler) reconcileNodeDiscoveryJob( if err != nil { return fmt.Errorf("unmarshal pod template: %w", err) } - - templateCopy := podTmpl.Template.DeepCopy() - if templateCopy.Spec.Affinity == nil { - templateCopy.Spec.Affinity = &corev1.Affinity{} + tmpl := podTmpl.Template + if tmpl.Labels == nil { + tmpl.Labels = map[string]string{} } - if templateCopy.Spec.Affinity.NodeAffinity == nil { - templateCopy.Spec.Affinity.NodeAffinity = &corev1.NodeAffinity{} + tmpl.Labels[constants.LabelComponent] = constants.ComponentNodeDiscovery + if tmpl.Spec.Affinity == nil { + tmpl.Spec.Affinity = &corev1.Affinity{} } - if templateCopy.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil { - templateCopy.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution = &corev1.NodeSelector{ + if tmpl.Spec.Affinity.NodeAffinity == nil { + tmpl.Spec.Affinity.NodeAffinity = &corev1.NodeAffinity{} + } + if tmpl.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil { + tmpl.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution = &corev1.NodeSelector{ NodeSelectorTerms: make([]corev1.NodeSelectorTerm, 0), } } - templateCopy.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms = - append(templateCopy.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms, corev1.NodeSelectorTerm{ + tmpl.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms = + append(tmpl.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms, corev1.NodeSelectorTerm{ MatchFields: []corev1.NodeSelectorRequirement{ { Key: "metadata.name", @@ -329,19 +303,19 @@ func (r *GPUNodeReconciler) reconcileNodeDiscoveryJob( }, }) // allow job to run at any taint Nodes that marked as NoSchedule - if templateCopy.Spec.Tolerations == nil { - templateCopy.Spec.Tolerations = []corev1.Toleration{} + if tmpl.Spec.Tolerations == nil { + tmpl.Spec.Tolerations = []corev1.Toleration{} } - templateCopy.Spec.Tolerations = 
append(templateCopy.Spec.Tolerations, corev1.Toleration{ + tmpl.Spec.Tolerations = append(tmpl.Spec.Tolerations, corev1.Toleration{ Key: "NoSchedule", Operator: corev1.TolerationOpExists, }) - if len(templateCopy.Spec.Containers) > 0 { - if len(templateCopy.Spec.Containers[0].Env) == 0 { - templateCopy.Spec.Containers[0].Env = []corev1.EnvVar{} + if len(tmpl.Spec.Containers) > 0 { + if len(tmpl.Spec.Containers[0].Env) == 0 { + tmpl.Spec.Containers[0].Env = []corev1.EnvVar{} } - templateCopy.Spec.Containers[0].Env = append(templateCopy.Spec.Containers[0].Env, corev1.EnvVar{ + tmpl.Spec.Containers[0].Env = append(tmpl.Spec.Containers[0].Env, corev1.EnvVar{ Name: constants.NodeDiscoveryReportGPUNodeEnvName, Value: gpunode.Name, }) @@ -350,12 +324,14 @@ func (r *GPUNodeReconciler) reconcileNodeDiscoveryJob( // create node-discovery job job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ - Name: getDiscoveryJobName(gpunode.Name), - Namespace: utils.CurrentNamespace(), + Name: getDiscoveryJobName(gpunode.Name), + Namespace: utils.CurrentNamespace(), + Labels: tmpl.Labels, + Annotations: tmpl.Annotations, }, Spec: batchv1.JobSpec{ TTLSecondsAfterFinished: ptr.To[int32](3600 * 10), - Template: *templateCopy, + Template: tmpl, }, } if err := r.Get(ctx, client.ObjectKeyFromObject(job), job); err != nil { @@ -367,7 +343,7 @@ func (r *GPUNodeReconciler) reconcileNodeDiscoveryJob( return fmt.Errorf("create node discovery job %w", err) } } else { - return fmt.Errorf("create node job %w", err) + return fmt.Errorf("create node discovery job %w", err) } } @@ -432,7 +408,7 @@ func (r *GPUNodeReconciler) createHypervisorPod(ctx context.Context, key client. if err != nil { return fmt.Errorf("failed to unmarshal pod template: %w", err) } - spec := podTmpl.Template.Spec.DeepCopy() + spec := podTmpl.Template.Spec if spec.NodeSelector == nil { spec.NodeSelector = make(map[string]string) } @@ -450,16 +426,24 @@ func (r *GPUNodeReconciler) createHypervisorPod(ctx context.Context, key client. 
ReadOnly: false, MountPath: constants.TFDataPath, }) + spec.ServiceAccountName = constants.HypervisorServiceAccountName newPod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: key.Name, Namespace: key.Namespace, - Labels: map[string]string{ - fmt.Sprintf(constants.GPUNodePoolIdentifierLabelFormat, pool.Name): "true", - constants.LabelKeyPodTemplateHash: utils.GetObjectHash(pool.Spec.ComponentConfig.Hypervisor), - }, + Labels: func() map[string]string { + mergedLabels := make(map[string]string) + for k, v := range podTmpl.Template.Labels { + mergedLabels[k] = v + } + mergedLabels[fmt.Sprintf(constants.GPUNodePoolIdentifierLabelFormat, pool.Name)] = "true" + mergedLabels[constants.LabelKeyPodTemplateHash] = utils.GetObjectHash(pool.Spec.ComponentConfig.Hypervisor) + mergedLabels[constants.LabelComponent] = constants.ComponentHypervisor + return mergedLabels + }(), + Annotations: podTmpl.Template.Annotations, }, - Spec: *spec, + Spec: spec, } if newPod.Spec.Tolerations == nil { @@ -579,31 +563,12 @@ func (r *GPUNodeReconciler) reconcileCloudVendorNode(ctx context.Context, node * return nil } -func (r *GPUNodeReconciler) CalculateVirtualCapacity(node *tfv1.GPUNode, pool *tfv1.GPUPool) (resource.Quantity, resource.Quantity) { - diskSize, _ := node.Status.NodeInfo.DataDiskSize.AsInt64() - ramSize, _ := node.Status.NodeInfo.RAMSize.AsInt64() - - virtualVRAM := node.Status.TotalVRAM.DeepCopy() - // TODO: panic if not set TFlopsOversellRatio - vTFlops := node.Status.TotalTFlops.AsApproximateFloat64() * (float64(pool.Spec.CapacityConfig.Oversubscription.TFlopsOversellRatio) / 100.0) - - virtualVRAM.Add(*resource.NewQuantity( - int64(float64(float64(diskSize)*float64(pool.Spec.CapacityConfig.Oversubscription.VRAMExpandToHostDisk)/100.0)), - resource.DecimalSI), - ) - virtualVRAM.Add(*resource.NewQuantity( - int64(float64(float64(ramSize)*float64(pool.Spec.CapacityConfig.Oversubscription.VRAMExpandToHostMem)/100.0)), - resource.DecimalSI), - ) - - return virtualVRAM, *resource.NewQuantity(int64(vTFlops), resource.DecimalSI) -} - // SetupWithManager sets up the controller with the Manager. func (r *GPUNodeReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). For(&tfv1.GPUNode{}). Named("gpunode"). + // TODO: should not own node, let node_claim_controller to own node for cloud vendor VM nodes, Owns(&corev1.Node{}). Owns(&batchv1.Job{}). Owns(&corev1.Pod{}). 
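The capacity math that the deleted CalculateVirtualCapacity helper performed, and that the new gpuallocator.RefreshGPUNodeCapacity call is presumably expected to keep producing, boils down to the sketch below. It is a minimal illustration reconstructed from the removed lines; virtualCapacitySketch is an illustrative name, not a function in the codebase, and the oversubscription fields are percentages (a TFlopsOversellRatio of 100 leaves physical capacity unchanged).

package sketch

import (
	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// virtualCapacitySketch mirrors the arithmetic of the removed CalculateVirtualCapacity:
// TFlops are oversold by a percentage ratio, and VRAM is expanded onto host disk and
// host RAM by the configured percentages.
func virtualCapacitySketch(node *tfv1.GPUNode, pool *tfv1.GPUPool) (resource.Quantity, resource.Quantity) {
	diskSize, _ := node.Status.NodeInfo.DataDiskSize.AsInt64()
	ramSize, _ := node.Status.NodeInfo.RAMSize.AsInt64()

	// Oversell physical TFlops by the configured percentage ratio.
	vTFlops := node.Status.TotalTFlops.AsApproximateFloat64() *
		float64(pool.Spec.CapacityConfig.Oversubscription.TFlopsOversellRatio) / 100.0

	// Spill a percentage of the host data disk and host memory into virtual VRAM.
	virtualVRAM := node.Status.TotalVRAM.DeepCopy()
	virtualVRAM.Add(*resource.NewQuantity(
		int64(float64(diskSize)*float64(pool.Spec.CapacityConfig.Oversubscription.VRAMExpandToHostDisk)/100.0),
		resource.DecimalSI))
	virtualVRAM.Add(*resource.NewQuantity(
		int64(float64(ramSize)*float64(pool.Spec.CapacityConfig.Oversubscription.VRAMExpandToHostMem)/100.0),
		resource.DecimalSI))

	return virtualVRAM, *resource.NewQuantity(int64(vTFlops), resource.DecimalSI)
}

Under this scheme, a pool configured with a TFlopsOversellRatio of 500 advertises five times the physical TFlops, which is the virtual figure the pool controller then aggregates into virtualAvailableTFlops.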
diff --git a/internal/controller/gpunode_controller_test.go b/internal/controller/gpunode_controller_test.go index 15b370d7..18ceec48 100644 --- a/internal/controller/gpunode_controller_test.go +++ b/internal/controller/gpunode_controller_test.go @@ -41,7 +41,7 @@ var _ = Describe("GPUNode Controller", func() { By("checking that the k8s node name should be set") Eventually(func(g Gomega) { g.Expect(gpuNode.Status.KubernetesNodeName).Should(Equal(gpuNode.Name)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) By("checking that the node discovery job is created") Eventually(func(g Gomega) { @@ -52,7 +52,7 @@ var _ = Describe("GPUNode Controller", func() { }, job)).Should(Succeed()) g.Expect(job.Spec.TTLSecondsAfterFinished).Should(Equal(ptr.To[int32](3600 * 10))) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) By("checking that the hypervisor pod is created") pod := &corev1.Pod{} @@ -63,13 +63,13 @@ var _ = Describe("GPUNode Controller", func() { }, pod) g.Expect(err).ShouldNot(HaveOccurred()) g.Expect(pod.Status.Phase).Should(Equal(corev1.PodRunning)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) By("checking that the gpunode status phase should be running") Eventually(func(g Gomega) { gpunode := tfEnv.GetGPUNode(0, 0) g.Expect(gpunode.Status.Phase).Should(Equal(tfv1.TensorFusionGPUNodePhaseRunning)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) By("checking the hypervisor pod should be recreated when enters terminated status") pod.Status.Phase = corev1.PodFailed @@ -82,7 +82,7 @@ var _ = Describe("GPUNode Controller", func() { }, newPod) g.Expect(err).ShouldNot(HaveOccurred()) g.Expect(newPod.UID).ShouldNot(Equal(pod.UID)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) tfEnv.Cleanup() diff --git a/internal/controller/gpupool_controller.go b/internal/controller/gpupool_controller.go index 257c94cf..dc9982d6 100644 --- a/internal/controller/gpupool_controller.go +++ b/internal/controller/gpupool_controller.go @@ -159,6 +159,9 @@ func (r *GPUPoolReconciler) reconcilePoolCurrentCapacityAndReadiness(ctx context virtualAvailableVRAM := resource.Quantity{} virtualAvailableTFlops := resource.Quantity{} + runningAppsCnt := int32(0) + deduplicationMap := make(map[string]struct{}) + for _, node := range nodes.Items { totalGPUs = totalGPUs + node.Status.TotalGPUs totalVRAM.Add(node.Status.TotalVRAM) @@ -178,6 +181,13 @@ func (r *GPUPoolReconciler) reconcilePoolCurrentCapacityAndReadiness(ctx context if node.Status.VirtualAvailableTFlops != nil { virtualAvailableTFlops.Add(*node.Status.VirtualAvailableTFlops) } + + for _, runningApp := range node.Status.AllocationInfo { + if _, ok := deduplicationMap[runningApp.Name+"_"+runningApp.Namespace]; !ok { + runningAppsCnt++ + deduplicationMap[runningApp.Name+"_"+runningApp.Namespace] = struct{}{} + } + } } pool.Status.TotalGPUs = totalGPUs @@ -196,6 +206,8 @@ func (r *GPUPoolReconciler) reconcilePoolCurrentCapacityAndReadiness(ctx context pool.Status.VirtualTFlops = virtualTFlops pool.Status.VirtualVRAM = virtualVRAM + pool.Status.RunningAppsCnt = runningAppsCnt + allowScaleToZero := true if pool.Spec.CapacityConfig != nil && pool.Spec.CapacityConfig.MinResources != nil { minTFlops, _ := pool.Spec.CapacityConfig.MinResources.TFlops.AsInt64() diff --git a/internal/controller/gpupool_controller_test.go b/internal/controller/gpupool_controller_test.go index bfbfd728..49e276d2 100644 --- a/internal/controller/gpupool_controller_test.go +++ 
b/internal/controller/gpupool_controller_test.go @@ -19,6 +19,7 @@ package controller import ( "encoding/json" "fmt" + "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/constants" @@ -42,7 +43,7 @@ var _ = Describe("GPUPool Controller", func() { Eventually(func(g Gomega) { pool := tfEnv.GetGPUPool(0) g.Expect(pool.Status.Phase).Should(Equal(tfv1.TensorFusionPoolPhaseRunning)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) tfEnv.Cleanup() }) }) @@ -58,7 +59,7 @@ var _ = Describe("GPUPool Controller", func() { g.Expect(pool.Status.ComponentStatus.HypervisorVersion).To(Equal(oldHash)) g.Expect(pool.Status.ComponentStatus.HyperVisorUpdateProgress).To(BeZero()) g.Expect(pool.Status.ComponentStatus.HypervisorConfigSynced).To(BeFalse()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) By("verifying hypervisor version should be updated upon configuration changes") updateHypervisorConfig(tfEnv) @@ -69,7 +70,7 @@ var _ = Describe("GPUPool Controller", func() { g.Expect(pool.Status.ComponentStatus.HypervisorVersion).To(Equal(newHash)) g.Expect(pool.Status.ComponentStatus.HyperVisorUpdateProgress).To(BeZero()) g.Expect(pool.Status.ComponentStatus.HypervisorConfigSynced).To(BeFalse()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) tfEnv.Cleanup() }) @@ -79,7 +80,7 @@ var _ = Describe("GPUPool Controller", func() { AddPoolWithNodeCount(1). SetGpuCountPerNode(1). Build() - updateRollingUpdatePolicy(tfEnv, false, 100, "3s") + updateRollingUpdatePolicy(tfEnv, false, 100, "1s") _, oldHash := triggerHypervisorUpdate(tfEnv) verifyAllHypervisorPodHashConsistently(tfEnv, oldHash) tfEnv.Cleanup() @@ -99,7 +100,7 @@ var _ = Describe("GPUPool Controller", func() { verifyHypervisorUpdateProgressConsistently(tfEnv, 50) By("changing the batch inteval to trigger next update batch") - updateRollingUpdatePolicy(tfEnv, true, 50, "3s") + updateRollingUpdatePolicy(tfEnv, true, 50, "1s") verifyHypervisorPodHash(tfEnv.GetGPUNode(0, 1), newHash) verifyHypervisorUpdateProgress(tfEnv, 100) @@ -128,7 +129,7 @@ var _ = Describe("GPUPool Controller", func() { AddPoolWithNodeCount(2). SetGpuCountPerNode(1). Build() - updateRollingUpdatePolicy(tfEnv, true, 50, "3s") + updateRollingUpdatePolicy(tfEnv, true, 50, "1s") newHash, _ := triggerHypervisorUpdate(tfEnv) verifyHypervisorPodHash(tfEnv.GetGPUNode(0, 0), newHash) verifyHypervisorPodHash(tfEnv.GetGPUNode(0, 1), newHash) @@ -136,24 +137,12 @@ var _ = Describe("GPUPool Controller", func() { tfEnv.Cleanup() }) - // It("Should perform update according to non-divisible batch percentage", func() { - // tfEnv := NewTensorFusionEnvBuilder(). - // AddPoolWithNodeCount(3). - // SetGpuCountPerNode(1). - // Build() - // updateRollingUpdatePolicy(tfEnv, true, 66, "3s") - // newHash, _ := triggerHypervisorUpdate(tfEnv) - // verifyAllHypervisorPodHash(tfEnv, newHash) - // verifyHypervisorUpdateProgress(tfEnv, 100) - // tfEnv.Cleanup() - // }) - It("Should update all nodes at once if BatchPercentage is 100", func() { tfEnv := NewTensorFusionEnvBuilder(). AddPoolWithNodeCount(3). SetGpuCountPerNode(1). 
Build() - updateRollingUpdatePolicy(tfEnv, true, 100, "3s") + updateRollingUpdatePolicy(tfEnv, true, 100, "1s") newHash, _ := triggerHypervisorUpdate(tfEnv) verifyAllHypervisorPodHash(tfEnv, newHash) verifyHypervisorUpdateProgress(tfEnv, 100) @@ -172,7 +161,7 @@ var _ = Describe("GPUPool Controller", func() { g.Expect(pool.Status.ComponentStatus.WorkerVersion).To(Equal(oldHash)) g.Expect(pool.Status.ComponentStatus.WorkerUpdateProgress).To(BeZero()) g.Expect(pool.Status.ComponentStatus.WorkerConfigSynced).To(BeFalse()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) By("verifying worker version should be updated upon configuration changes") updateWorkerConfig(tfEnv) @@ -183,7 +172,7 @@ var _ = Describe("GPUPool Controller", func() { g.Expect(pool.Status.ComponentStatus.WorkerVersion).To(Equal(newHash)) g.Expect(pool.Status.ComponentStatus.WorkerUpdateProgress).To(BeZero()) g.Expect(pool.Status.ComponentStatus.WorkerConfigSynced).To(BeFalse()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) tfEnv.Cleanup() }) @@ -194,15 +183,15 @@ var _ = Describe("GPUPool Controller", func() { SetGpuCountPerNode(1). Build() - By("configuring a large enougth batch inteval to prevent next update batch") + By("configuring a large enough batch interval to prevent next update batch") updateRollingUpdatePolicy(tfEnv, true, 50, "10m") createWorkloads(tfEnv, 2) triggerWorkerUpdate(tfEnv) verifyWorkerPodContainerNameConsistently(1, "tensorfusion-worker") verifyWorkerUpdateProgressConsistently(tfEnv, 50) - By("changing the batch inteval to trigger next update batch") - updateRollingUpdatePolicy(tfEnv, true, 50, "3s") + By("changing the batch interval to trigger next update batch") + updateRollingUpdatePolicy(tfEnv, true, 50, "1s") verifyAllWorkerPodContainerName(tfEnv, "updated-name") verifyWorkerUpdateProgress(tfEnv, 100) @@ -215,7 +204,7 @@ var _ = Describe("GPUPool Controller", func() { AddPoolWithNodeCount(1). SetGpuCountPerNode(2). Build() - updateRollingUpdatePolicy(tfEnv, true, 50, "3s") + updateRollingUpdatePolicy(tfEnv, true, 50, "1s") createWorkloads(tfEnv, 2) triggerWorkerUpdate(tfEnv) verifyAllWorkerPodContainerName(tfEnv, "updated-name") @@ -229,7 +218,7 @@ var _ = Describe("GPUPool Controller", func() { AddPoolWithNodeCount(1). SetGpuCountPerNode(2). 
Build() - updateRollingUpdatePolicy(tfEnv, true, 100, "3s") + updateRollingUpdatePolicy(tfEnv, true, 100, "1s") createWorkloads(tfEnv, 2) triggerWorkerUpdate(tfEnv) verifyAllWorkerPodContainerName(tfEnv, "updated-name") @@ -248,7 +237,7 @@ var _ = Describe("GPUPool Controller", func() { g.Expect(pool.Status.ComponentStatus.ClientVersion).To(Equal(oldHash)) g.Expect(pool.Status.ComponentStatus.ClientUpdateProgress).To(BeZero()) g.Expect(pool.Status.ComponentStatus.ClientConfigSynced).To(BeFalse()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) By("verifying client version should be updated upon configuration changes") updateClientConfig(tfEnv) @@ -259,7 +248,7 @@ var _ = Describe("GPUPool Controller", func() { g.Expect(pool.Status.ComponentStatus.ClientVersion).To(Equal(newHash)) g.Expect(pool.Status.ComponentStatus.ClientUpdateProgress).To(BeZero()) g.Expect(pool.Status.ComponentStatus.ClientConfigSynced).To(BeFalse()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) tfEnv.Cleanup() }) @@ -389,7 +378,7 @@ func updateRollingUpdatePolicy(tfEnv *TensorFusionEnv, autoUpdate bool, batchPer g.Expect(newPolicy.AutoUpdate).Should(Equal(policy.AutoUpdate)) g.Expect(newPolicy.BatchPercentage).Should(Equal(policy.BatchPercentage)) g.Expect(newPolicy.BatchInterval).Should(Equal(policy.BatchInterval)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyGpuPoolClientHash(tfEnv *TensorFusionEnv, oldHash string) string { @@ -400,7 +389,7 @@ func verifyGpuPoolClientHash(tfEnv *TensorFusionEnv, oldHash string) string { newHash := utils.GetObjectHash(pool.Spec.ComponentConfig.Client) g.Expect(newHash).ShouldNot(Equal(oldHash)) g.Expect(pool.Status.ComponentStatus.ClientVersion).To(Equal(newHash)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) return pool.Status.ComponentStatus.ClientVersion } @@ -413,7 +402,7 @@ func verifyGpuPoolHypervisorHash(tfEnv *TensorFusionEnv, oldHash string) string newHash := utils.GetObjectHash(pool.Spec.ComponentConfig.Hypervisor) g.Expect(newHash).ShouldNot(Equal(oldHash)) g.Expect(pool.Status.ComponentStatus.HypervisorVersion).To(Equal(newHash)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) return pool.Status.ComponentStatus.HypervisorVersion } @@ -426,7 +415,7 @@ func verifyGpuPoolWorkerHash(tfEnv *TensorFusionEnv, oldHash string) string { newHash := utils.GetObjectHash(pool.Spec.ComponentConfig.Worker) g.Expect(newHash).ShouldNot(Equal(oldHash)) g.Expect(pool.Status.ComponentStatus.WorkerVersion).To(Equal(newHash)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) return pool.Status.ComponentStatus.WorkerVersion } @@ -441,7 +430,7 @@ func verifyHypervisorPodHash(gpuNode *tfv1.GPUNode, hash string) { }, pod)).Should(Succeed()) g.Expect(pod.Labels[constants.LabelKeyPodTemplateHash]).Should(Equal(hash)) updatePodPhaseToRunning(pod, hash) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyClientPodHash(index int, hash string) { @@ -451,7 +440,7 @@ func verifyClientPodHash(index int, hash string) { key := client.ObjectKey{Namespace: utils.CurrentNamespace(), Name: getClientPodName(index)} g.Expect(k8sClient.Get(ctx, key, pod)).Should(Succeed()) g.Expect(pod.Labels[constants.LabelKeyPodTemplateHash]).Should(Equal(hash)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyClientPodHashConsistently(index int, hash string) { @@ -461,7 +450,7 @@ func verifyClientPodHashConsistently(index int, hash string) { key := 
client.ObjectKey{Namespace: utils.CurrentNamespace(), Name: getClientPodName(index)} g.Expect(k8sClient.Get(ctx, key, pod)).Should(Succeed()) g.Expect(pod.Labels[constants.LabelKeyPodTemplateHash]).Should(Equal(hash)) - }, duration, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyHypervisorPodHashConsistently(gpuNode *tfv1.GPUNode, hash string) { @@ -474,7 +463,7 @@ func verifyHypervisorPodHashConsistently(gpuNode *tfv1.GPUNode, hash string) { }, pod)).Should(Succeed()) g.Expect(pod.Labels[constants.LabelKeyPodTemplateHash]).Should(Equal(hash)) updatePodPhaseToRunning(pod, hash) - }, duration, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyClientPodWasDeleted(index int) { @@ -482,7 +471,7 @@ func verifyClientPodWasDeleted(index int) { pod := &corev1.Pod{} key := client.ObjectKey{Namespace: utils.CurrentNamespace(), Name: getClientPodName(index)} g.Expect(k8sClient.Get(ctx, key, pod)).ShouldNot(Succeed()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyAllHypervisorPodHash(tfEnv *TensorFusionEnv, hash string) { @@ -499,7 +488,7 @@ func verifyAllHypervisorPodHash(tfEnv *TensorFusionEnv, hash string) { g.Expect(pod.Labels[constants.LabelKeyPodTemplateHash]).Should(Equal(hash)) updatePodPhaseToRunning(pod, hash) } - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } // func verifyWorkerPodContainerName(workloadIndex int, name string) { @@ -513,7 +502,7 @@ func verifyAllHypervisorPodHash(tfEnv *TensorFusionEnv, hash string) { // for _, pod := range podList.Items { // g.Expect(pod.Spec.Containers[0].Name).Should(Equal(name)) // } -// }, timeout, interval).Should(Succeed()) +// }).Should(Succeed()) // } func verifyWorkerPodContainerNameConsistently(workloadIndex int, name string) { @@ -527,7 +516,7 @@ func verifyWorkerPodContainerNameConsistently(workloadIndex int, name string) { for _, pod := range podList.Items { g.Expect(pod.Spec.Containers[0].Name).Should(Equal(name)) } - }, duration, interval).Should(Succeed()) + }, 1*time.Second).Should(Succeed()) } func verifyAllWorkerPodContainerName(tfEnv *TensorFusionEnv, name string) { @@ -549,7 +538,7 @@ func verifyAllWorkerPodContainerName(tfEnv *TensorFusionEnv, name string) { } } - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyAllHypervisorPodHashConsistently(tfEnv *TensorFusionEnv, hash string) { @@ -565,7 +554,7 @@ func verifyAllHypervisorPodHashConsistently(tfEnv *TensorFusionEnv, hash string) g.Expect(pod.Labels[constants.LabelKeyPodTemplateHash]).Should(Equal(hash)) updatePodPhaseToRunning(pod, hash) } - }, duration, interval).Should(Succeed()) + }).Should(Succeed()) } // func verifyAllWorkerPodContainerNameConsistently(tfEnv *TensorFusionEnv, name string) { @@ -600,7 +589,7 @@ func verifyHypervisorUpdateProgress(tfEnv *TensorFusionEnv, progress int32) { } else { g.Expect(pool.Status.ComponentStatus.HypervisorConfigSynced).To(BeFalse()) } - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyWorkerUpdateProgress(tfEnv *TensorFusionEnv, progress int32) { @@ -613,7 +602,7 @@ func verifyWorkerUpdateProgress(tfEnv *TensorFusionEnv, progress int32) { } else { g.Expect(pool.Status.ComponentStatus.WorkerConfigSynced).To(BeFalse()) } - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyClientUpdateProgress(tfEnv *TensorFusionEnv, progress int32) { @@ -626,7 +615,7 @@ func verifyClientUpdateProgress(tfEnv *TensorFusionEnv, progress int32) { } else { 
g.Expect(pool.Status.ComponentStatus.ClientConfigSynced).To(BeFalse()) } - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyClientUpdateProgressConsistently(tfEnv *TensorFusionEnv, progress int32) { @@ -639,7 +628,7 @@ func verifyClientUpdateProgressConsistently(tfEnv *TensorFusionEnv, progress int } else { g.Expect(pool.Status.ComponentStatus.ClientConfigSynced).To(BeFalse()) } - }, duration, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyHypervisorUpdateProgressConsistently(tfEnv *TensorFusionEnv, progress int32) { @@ -652,7 +641,7 @@ func verifyHypervisorUpdateProgressConsistently(tfEnv *TensorFusionEnv, progress } else { g.Expect(pool.Status.ComponentStatus.HypervisorConfigSynced).To(BeFalse()) } - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyWorkerUpdateProgressConsistently(tfEnv *TensorFusionEnv, progress int32) { @@ -665,7 +654,7 @@ func verifyWorkerUpdateProgressConsistently(tfEnv *TensorFusionEnv, progress int } else { g.Expect(pool.Status.ComponentStatus.WorkerConfigSynced).To(BeFalse()) } - }, duration, interval).Should(Succeed()) + }).Should(Succeed()) } // no pod controller in EnvTest, need to manually update pod status @@ -684,7 +673,7 @@ func ensureGpuPoolIsRunning(tfEnv *TensorFusionEnv) { Eventually(func(g Gomega) { pool := tfEnv.GetGPUPool(0) g.Expect(pool.Status.Phase).Should(Equal(tfv1.TensorFusionPoolPhaseRunning)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } // no RepliaSet like controller in EnvTest, need to create by ourself @@ -716,7 +705,7 @@ func createClientPodByIndex(tfEnv *TensorFusionEnv, index int) { pod := &corev1.Pod{} key := client.ObjectKey{Namespace: utils.CurrentNamespace(), Name: getClientPodName(index)} g.Expect(k8sClient.Get(ctx, key, pod)).Should(Succeed()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func createClientPods(tfEnv *TensorFusionEnv, count int) { @@ -751,7 +740,7 @@ func createClientPods(tfEnv *TensorFusionEnv, count int) { key := client.ObjectKey{Namespace: utils.CurrentNamespace(), Name: getClientPodName(i)} g.Expect(k8sClient.Get(ctx, key, pod)).Should(Succeed()) } - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func cleanupClientPods() { @@ -766,6 +755,7 @@ func createWorkloads(tfEnv *TensorFusionEnv, count int) { replicas := 1 workload := createTensorFusionWorkload(pool.Name, key, replicas) checkWorkerPodCount(workload) + checkWorkloadStatus(workload) } } diff --git a/internal/controller/pod_controller.go b/internal/controller/pod_controller.go index 7dbb997c..d3eacae7 100644 --- a/internal/controller/pod_controller.go +++ b/internal/controller/pod_controller.go @@ -19,9 +19,11 @@ package controller import ( "context" "fmt" + "strconv" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/portallocator" "github.com/NexusGPU/tensor-fusion/internal/utils" v1 "github.com/NexusGPU/tensor-fusion/internal/webhook/v1" "github.com/samber/lo" @@ -40,7 +42,8 @@ import ( // PodReconciler reconciles a Pod object type PodReconciler struct { client.Client - Scheme *runtime.Scheme + Scheme *runtime.Scheme + PortAllocator *portallocator.PortAllocator } // +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;create;update;patch;delete;deletecollection @@ -59,6 +62,15 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R log.Error(err, "Failed to get Pod") return ctrl.Result{}, 
err } + + // Release cluster level port when Pod deleted + if !pod.DeletionTimestamp.IsZero() { + if pod.Annotations[constants.GenHostPortLabel] == constants.GenHostPortLabelValue { + podPortNumber, _ := strconv.Atoi(pod.Annotations[constants.GenPortNumberAnnotation]) + _ = r.PortAllocator.ReleaseClusterLevelHostPort(pod.Name, podPortNumber) + log.Info("Released port", "pod", pod.Name, "port", podPortNumber) + } + } // generate tensor fusion connections and apply to cluster tfConnection := generateTensorFusionConnection(pod) if tfConnection == nil { diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go index 3ec5e77f..8945a295 100644 --- a/internal/controller/suite_test.go +++ b/internal/controller/suite_test.go @@ -35,6 +35,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" + "k8s.io/client-go/util/retry" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/envtest" @@ -47,7 +48,10 @@ import ( "github.com/NexusGPU/tensor-fusion/internal/config" "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" + "github.com/NexusGPU/tensor-fusion/internal/metrics" + "github.com/NexusGPU/tensor-fusion/internal/portallocator" "github.com/NexusGPU/tensor-fusion/internal/utils" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" // +kubebuilder:scaffold:imports ) @@ -60,16 +64,14 @@ var testEnv *envtest.Environment var ctx context.Context var cancel context.CancelFunc var allocator *gpuallocator.GpuAllocator - -const ( - timeout = time.Second * 10 - duration = time.Second * 5 - interval = time.Millisecond * 100 -) +var metricsRecorder *metrics.MetricsRecorder func TestControllers(t *testing.T) { RegisterFailHandler(Fail) - + SetDefaultEventuallyTimeout(6 * time.Second) + SetDefaultEventuallyPollingInterval(200 * time.Millisecond) + SetDefaultConsistentlyDuration(5 * time.Second) + SetDefaultConsistentlyPollingInterval(200 * time.Millisecond) RunSpecs(t, "Controller Suite") } @@ -119,12 +121,25 @@ var _ = BeforeSuite(func() { mgr, err := ctrl.NewManager(cfg, ctrl.Options{ Scheme: scheme.Scheme, + Metrics: metricsserver.Options{ + BindAddress: "0", + }, }) Expect(err).ToNot(HaveOccurred()) + + metricsRecorder = &metrics.MetricsRecorder{ + MetricsOutputPath: "./metrics.log", + HourlyUnitPriceMap: map[string]float64{ + "A100": 10, + }, + WorkerUnitPriceMap: make(map[string]map[string]metrics.RawBillingPricing), + } + err = (&TensorFusionClusterReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Recorder: mgr.GetEventRecorderFor("TensorFusionCluster"), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("TensorFusionCluster"), + MetricsRecorder: metricsRecorder, }).SetupWithManager(mgr) Expect(err).ToNot(HaveOccurred()) @@ -142,12 +157,11 @@ var _ = BeforeSuite(func() { }).SetupWithManager(mgr) Expect(err).ToNot(HaveOccurred()) - // err = (&GPUPoolCompactionReconciler{ - // Client: mgr.GetClient(), - // Scheme: mgr.GetScheme(), - // Recorder: mgr.GetEventRecorderFor("GPUPoolCompaction"), - // }).SetupWithManager(mgr) - // Expect(err).ToNot(HaveOccurred()) + portAllocator, err := portallocator.NewPortAllocator(ctx, mgr.GetClient(), "40000-42000", "42001-60000") + if err != nil { + Expect(err).ToNot(HaveOccurred()) + } + _ = portAllocator.SetupWithManager(ctx, mgr) err = (&GPUNodeClassReconciler{ 
Client: mgr.GetClient(), @@ -162,8 +176,9 @@ var _ = BeforeSuite(func() { Expect(err).ToNot(HaveOccurred()) err = (&PodReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + PortAllocator: portAllocator, }).SetupWithManager(mgr) Expect(err).ToNot(HaveOccurred()) @@ -180,7 +195,7 @@ var _ = BeforeSuite(func() { }).SetupWithManager(mgr) Expect(err).ToNot(HaveOccurred()) - allocator = gpuallocator.NewGpuAllocator(ctx, mgr.GetClient(), 3*time.Second) + allocator = gpuallocator.NewGpuAllocator(ctx, mgr.GetClient(), 150*time.Millisecond) _, err = allocator.SetupWithManager(ctx, mgr) Expect(err).ToNot(HaveOccurred()) @@ -198,11 +213,12 @@ var _ = BeforeSuite(func() { Expect(err).ToNot(HaveOccurred()) err = (&TensorFusionWorkloadReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Allocator: allocator, - Recorder: mgr.GetEventRecorderFor("TensorFusionWorkload"), - GpuInfos: config.MockGpuInfo(), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Allocator: allocator, + Recorder: mgr.GetEventRecorderFor("TensorFusionWorkload"), + GpuInfos: config.MockGpuInfo(), + PortAllocator: portAllocator, }).SetupWithManager(mgr) Expect(err).ToNot(HaveOccurred()) @@ -232,15 +248,21 @@ type TensorFusionEnv struct { func (c *TensorFusionEnv) GetCluster() *tfv1.TensorFusionCluster { GinkgoHelper() tfc := &tfv1.TensorFusionCluster{} - Eventually(func(g Gomega) { - g.Expect(k8sClient.Get(ctx, c.clusterKey, tfc)).Should(Succeed()) - }).Should(Succeed()) + Expect(k8sClient.Get(ctx, c.clusterKey, tfc)).Should(Succeed()) return tfc } func (c *TensorFusionEnv) UpdateCluster(tfc *tfv1.TensorFusionCluster) { GinkgoHelper() - Expect(k8sClient.Update(ctx, tfc)).Should(Succeed()) + err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + latest := &tfv1.TensorFusionCluster{} + if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(tfc), latest); err != nil { + return err + } + latest.Spec = tfc.Spec + return k8sClient.Update(ctx, latest) + }) + Expect(err).Should(Succeed()) } func (c *TensorFusionEnv) Cleanup() { @@ -260,7 +282,7 @@ func (c *TensorFusionEnv) Cleanup() { Eventually(func(g Gomega) { pool := &tfv1.GPUPool{} g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: c.getPoolName(poolIndex)}, pool)).Should(HaveOccurred()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) delete(c.poolNodeMap, poolIndex) c.poolCount-- } @@ -269,7 +291,7 @@ func (c *TensorFusionEnv) Cleanup() { Eventually(func(g Gomega) { err := k8sClient.Get(ctx, c.clusterKey, tfc) g.Expect(err).Should(HaveOccurred()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func (c *TensorFusionEnv) GetGPUPoolList() *tfv1.GPUPoolList { @@ -280,7 +302,7 @@ func (c *TensorFusionEnv) GetGPUPoolList() *tfv1.GPUPoolList { constants.LabelKeyOwner: c.clusterKey.Name, }))).Should(Succeed()) g.Expect(poolList.Items).Should(HaveLen(c.poolCount)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) return poolList } @@ -289,7 +311,7 @@ func (c *TensorFusionEnv) GetGPUPool(poolIndex int) *tfv1.GPUPool { pool := &tfv1.GPUPool{} Eventually(func(g Gomega) { g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: c.getPoolName(poolIndex)}, pool)).Should(Succeed()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) return pool } @@ -301,7 +323,7 @@ func (c *TensorFusionEnv) GetGPUNodeList(poolIndex int) *tfv1.GPUNodeList { fmt.Sprintf(constants.GPUNodePoolIdentifierLabelFormat, c.getPoolName(poolIndex)): "true", 
}))).Should(Succeed()) g.Expect(nodeList.Items).Should(HaveLen(len(c.poolNodeMap[poolIndex]))) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) return nodeList } @@ -310,7 +332,7 @@ func (c *TensorFusionEnv) GetGPUNode(poolIndex int, nodeIndex int) *tfv1.GPUNode node := &tfv1.GPUNode{} Eventually(func(g Gomega) { g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: c.getNodeName(poolIndex, nodeIndex)}, node)).Should(Succeed()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) return node } @@ -321,7 +343,7 @@ func (c *TensorFusionEnv) DeleteGPUNode(poolIndex int, nodeIndex int) { Expect(k8sClient.Delete(ctx, node)).Should(Succeed()) Eventually(func(g Gomega) { g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: c.getNodeName(poolIndex, nodeIndex)}, node)).Should(HaveOccurred()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) delete(c.poolNodeMap[poolIndex], nodeIndex) } @@ -333,7 +355,7 @@ func (c *TensorFusionEnv) GetNodeGpuList(poolIndex int, nodeIndex int) *tfv1.GPU constants.LabelKeyOwner: c.getNodeName(poolIndex, nodeIndex), }))).Should(Succeed()) g.Expect(gpuList.Items).Should(HaveLen(c.poolNodeMap[poolIndex][nodeIndex])) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) return gpuList } @@ -356,7 +378,7 @@ func (c *TensorFusionEnv) GetPoolGpuList(poolIndex int) *tfv1.GPUList { constants.GpuPoolKey: c.getPoolName(poolIndex), }))).Should(Succeed()) g.Expect(gpuList.Items).Should(HaveLen(poolGpuCount)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) return gpuList } @@ -377,7 +399,7 @@ func (c *TensorFusionEnv) UpdateHypervisorStatus() { }), )).Should(Succeed()) g.Expect(podList.Items).Should(HaveLen(len(c.poolNodeMap[poolIndex]))) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) for _, pod := range podList.Items { pod.Status.Phase = corev1.PodRunning pod.Status.Conditions = append(pod.Status.Conditions, corev1.PodCondition{Type: corev1.PodReady, Status: corev1.ConditionTrue}) @@ -453,6 +475,14 @@ func (b *TensorFusionEnvBuilder) Build() *TensorFusionEnv { Name: b.clusterKey.Name, Namespace: b.clusterKey.Namespace, }, + Spec: tfv1.TensorFusionClusterSpec{ + GPUPools: []tfv1.GPUPoolDefinition{ + { + Name: fmt.Sprintf("pool-%d", b.poolCount), + SpecTemplate: *config.MockGPUPoolSpec, + }, + }, + }, } // construct pools @@ -477,7 +507,7 @@ func (b *TensorFusionEnvBuilder) Build() *TensorFusionEnv { constants.LabelKeyOwner: tfc.Name, }))).Should(Succeed()) g.Expect(gpuPoolList.Items).Should(HaveLen(b.poolCount)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) // generate nodes selectors := strings.Split(constants.InitialGPUNodeSelector, "=") diff --git a/internal/controller/tensorfusioncluster_controller.go b/internal/controller/tensorfusioncluster_controller.go index 035f8853..5eb93ed7 100644 --- a/internal/controller/tensorfusioncluster_controller.go +++ b/internal/controller/tensorfusioncluster_controller.go @@ -19,6 +19,7 @@ package controller import ( "context" "fmt" + "strconv" "sync" "k8s.io/apimachinery/pkg/api/errors" @@ -35,6 +36,7 @@ import ( tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/cloudprovider" "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/metrics" utils "github.com/NexusGPU/tensor-fusion/internal/utils" corev1 "k8s.io/api/core/v1" @@ -46,8 +48,9 @@ import ( // TensorFusionClusterReconciler reconciles a TensorFusionCluster object type TensorFusionClusterReconciler 
struct { client.Client - Scheme *runtime.Scheme - Recorder record.EventRecorder + Scheme *runtime.Scheme + Recorder record.EventRecorder + MetricsRecorder *metrics.MetricsRecorder LastProcessedItems sync.Map } @@ -302,6 +305,7 @@ func (r *TensorFusionClusterReconciler) reconcileGPUPool(ctx context.Context, tf } err = r.Create(ctx, gpupool) anyPoolChanged = true + r.updateMetricsRecorder(ctx, gpupool) if err != nil { errors = append(errors, fmt.Errorf("failed to create GPUPool %s: %w", key, err)) continue @@ -315,6 +319,7 @@ func (r *TensorFusionClusterReconciler) reconcileGPUPool(ctx context.Context, tf errors = append(errors, fmt.Errorf("failed to update GPUPool %s: %w", key, err)) } anyPoolChanged = true + r.updateMetricsRecorder(ctx, existingPool) } } } @@ -412,3 +417,33 @@ func (r *TensorFusionClusterReconciler) SetupWithManager(mgr ctrl.Manager) error Owns(&tfv1.GPUPool{}). Complete(r) } + +// Update metrics recorder's raw billing map +func (r *TensorFusionClusterReconciler) updateMetricsRecorder(ctx context.Context, pool *tfv1.GPUPool) { + log := log.FromContext(ctx) + if pool.Spec.QosConfig == nil { + log.Info("QosConfig is nil, skip updating metrics recorder", "pool", pool.Name) + return + } + + qosConfig := pool.Spec.QosConfig + if _, ok := r.MetricsRecorder.WorkerUnitPriceMap[pool.Name]; !ok { + r.MetricsRecorder.WorkerUnitPriceMap[pool.Name] = make(map[string]metrics.RawBillingPricing) + } + pricingDetail := r.MetricsRecorder.WorkerUnitPriceMap[pool.Name] + for _, pricing := range qosConfig.Pricing { + tflopsPerHour, _ := strconv.ParseFloat(pricing.Requests.PerFP16TFlopsPerHour, 64) + vramPerHour, _ := strconv.ParseFloat(pricing.Requests.PerGBOfVRAMPerHour, 64) + limitOverRequestChargingRatio, _ := strconv.ParseFloat(pricing.LimitsOverRequestsChargingRatio, 64) + + pricingDetail[string(pricing.Qos)] = metrics.RawBillingPricing{ + TflopsPerSecond: tflopsPerHour / float64(3600), + VramPerSecond: vramPerHour / float64(3600), + + TflopsOverRequestPerSecond: tflopsPerHour / float64(3600) * limitOverRequestChargingRatio, + VramOverRequestPerSecond: vramPerHour / float64(3600) * limitOverRequestChargingRatio, + } + } + + log.V(5).Info("Updated metrics recorder", "pool", pool.Name, "pricing", pricingDetail) +} diff --git a/internal/controller/tensorfusionconnection_controller_test.go b/internal/controller/tensorfusionconnection_controller_test.go index 61449da4..87f685c8 100644 --- a/internal/controller/tensorfusionconnection_controller_test.go +++ b/internal/controller/tensorfusionconnection_controller_test.go @@ -103,7 +103,7 @@ var _ = Describe("TensorFusionConnection Controller", func() { g.Expect(connection.Status.Phase).Should(Equal(workerStatus.WorkerPhase)) connectionUrl := fmt.Sprintf("native+%s+%d+%s-%s", workerStatus.WorkerIp, workerStatus.WorkerPort, workerStatus.WorkerName, workerStatus.ResourceVersion) g.Expect(connection.Status.ConnectionURL).Should(Equal(connectionUrl)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) }) It("should handle missing workload label", func() { @@ -122,7 +122,7 @@ var _ = Describe("TensorFusionConnection Controller", func() { Consistently(func(g Gomega) { g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(connectionNoLabel), connectionNoLabel)).Should(Succeed()) g.Expect(connectionNoLabel.Status.WorkerName).Should(Equal("")) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) // Clean up the test connection Expect(k8sClient.Delete(ctx, connectionNoLabel)).To(Succeed()) @@ -138,7 +138,7 @@ var _ = 
Describe("TensorFusionConnection Controller", func() { g.Expect(k8sClient.Get(ctx, typeNamespacedName, connection)).Should(Succeed()) workerStatus := workload.Status.WorkerStatuses[0] g.Expect(connection.Status.WorkerName).Should(Equal(workerStatus.WorkerName)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) By("Updating the workload to mark the worker as failed") Expect(k8sClient.Get(ctx, workloadNamespacedName, workload)).To(Succeed()) @@ -154,7 +154,7 @@ var _ = Describe("TensorFusionConnection Controller", func() { g.Expect(connection.Status.Phase).Should(Equal(workerStatus.WorkerPhase)) connectionUrl := fmt.Sprintf("native+%s+%d+%s-%s", workerStatus.WorkerIp, workerStatus.WorkerPort, workerStatus.WorkerName, workerStatus.ResourceVersion) g.Expect(connection.Status.ConnectionURL).Should(Equal(connectionUrl)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) }) It("should update status to WorkerPending when worker selection fails", func() { @@ -201,7 +201,7 @@ var _ = Describe("TensorFusionConnection Controller", func() { return false } return len(createdWorkload.Status.WorkerStatuses) == 0 - }, timeout, interval).Should(BeTrue()) + }).Should(BeTrue()) By("Creating a connection to the workload with no workers") failConnectionName := "test-connection-fail" @@ -230,7 +230,7 @@ var _ = Describe("TensorFusionConnection Controller", func() { return false } return failConnection.Status.Phase == tfv1.WorkerPending - }, timeout, interval).Should(BeTrue()) + }).Should(BeTrue()) By("Cleaning up test resources") Expect(k8sClient.Delete(ctx, failConnection)).To(Succeed()) diff --git a/internal/controller/tensorfusionworkload_controller.go b/internal/controller/tensorfusionworkload_controller.go index a10b0085..54cf63a8 100644 --- a/internal/controller/tensorfusionworkload_controller.go +++ b/internal/controller/tensorfusionworkload_controller.go @@ -20,7 +20,9 @@ import ( "context" "fmt" "sort" + "strconv" "strings" + "time" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" @@ -38,20 +40,21 @@ import ( "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" "github.com/NexusGPU/tensor-fusion/internal/metrics" + "github.com/NexusGPU/tensor-fusion/internal/portallocator" "github.com/NexusGPU/tensor-fusion/internal/utils" "github.com/NexusGPU/tensor-fusion/internal/worker" "github.com/lithammer/shortuuid/v4" - "github.com/prometheus/client_golang/prometheus" "github.com/samber/lo" ) // TensorFusionWorkloadReconciler reconciles a TensorFusionWorkload object type TensorFusionWorkloadReconciler struct { client.Client - Scheme *runtime.Scheme - Allocator *gpuallocator.GpuAllocator - Recorder record.EventRecorder - GpuInfos *[]config.GpuInfo + Scheme *runtime.Scheme + Allocator *gpuallocator.GpuAllocator + Recorder record.EventRecorder + GpuInfos *[]config.GpuInfo + PortAllocator *portallocator.PortAllocator } // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=tensorfusionworkloads,verbs=get;list;watch;create;update;patch;delete @@ -59,6 +62,8 @@ type TensorFusionWorkloadReconciler struct { // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=tensorfusionworkloads/finalizers,verbs=update // TensorFusionWorkload Reconciler +// +//nolint:gocyclo func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { log := log.FromContext(ctx) log.Info("Reconciling TensorFusionWorkload", "request", req) @@ -107,7 +112,14 @@ func (r 
*TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl // Process pods with our finalizer for i := range podList.Items { pod := &podList.Items[i] - deleted := pod.DeletionTimestamp != nil + deleted := !pod.DeletionTimestamp.IsZero() + + if deleted { + metrics.RemoveWorkerMetrics(pod.Name, pod.DeletionTimestamp.Time) + podPort, _ := strconv.Atoi(pod.Annotations[constants.GenPortNumberAnnotation]) + _ = r.PortAllocator.ReleaseHostPort(pod.Spec.NodeName, podPort) + } + // Handle our GPU resource cleanup finalizer _, err := utils.HandleFinalizer(ctx, pod, r.Client, func(ctx context.Context, obj *corev1.Pod) (bool, error) { return r.handlePodGPUCleanup(ctx, pod, workload) @@ -128,6 +140,9 @@ func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl return ctrl.Result{}, nil } + // init metrics map if needed + handleMetricsRecorder(podList, workload) + // Fetch the GPUPool pool := &tfv1.GPUPool{} if err := r.Get(ctx, client.ObjectKey{Name: workload.Spec.PoolName}, pool); err != nil { @@ -239,6 +254,14 @@ func (r *TensorFusionWorkloadReconciler) reconcileScaling( return ctrl.Result{}, nil } +func handleMetricsRecorder(podList *corev1.PodList, workload *tfv1.TensorFusionWorkload) { + now := time.Now() + for i := range podList.Items { + pod := &podList.Items[i] + metrics.SetWorkerMetricsByWorkload(pod, workload, now) + } +} + func (r *TensorFusionWorkloadReconciler) tryStartWorker( ctx context.Context, workerGenerator *worker.WorkerGenerator, @@ -246,8 +269,14 @@ func (r *TensorFusionWorkloadReconciler) tryStartWorker( workload *tfv1.TensorFusionWorkload, hash string, ) (*corev1.Pod, error) { - port := workerGenerator.AllocPort() - pod, hash, err := workerGenerator.GenerateWorkerPod(gpus, fmt.Sprintf("%s-tf-worker-", workload.Name), workload.Namespace, port, workload.Spec.Resources.Limits, hash) + if len(gpus) == 0 || gpus[0].Labels == nil { + return nil, fmt.Errorf("no gpus or no labels, can not assign host port for worker") + } + port, err := r.PortAllocator.AssignHostPort(gpus[0].Status.NodeSelector[constants.KubernetesHostNameLabel]) + if err != nil { + return nil, fmt.Errorf("get host port %w", err) + } + pod, hash, err := workerGenerator.GenerateWorkerPod(gpus, fmt.Sprintf("%s-tf-worker-", workload.Name), workload.Namespace, port, workload.Spec.Resources.Requests, workload.Spec.Resources.Limits, hash) if err != nil { return nil, fmt.Errorf("generate worker pod %w", err) } @@ -287,23 +316,12 @@ func (r *TensorFusionWorkloadReconciler) scaleDownWorkers(ctx context.Context, w for i := range pods { podToDelete := &pods[i] - log.Info("Scaling down worker pod", "name", podToDelete.Name) + log.Info("Scaling down worker pod", "name", podToDelete.Name, "workload", workload.Name) // Delete the pod with foreground deletion policy // The finalizer will handle GPU resource cleanup if err := r.deletePod(ctx, podToDelete); err != nil { return err } - - labels := prometheus.Labels{ - "worker": podToDelete.Name, - "namespace": podToDelete.Namespace, - "pool": workload.Spec.PoolName, - } - metrics.GpuTflopsRequest.Delete(labels) - metrics.GpuTflopsLimit.Delete(labels) - metrics.VramBytesRequest.Delete(labels) - metrics.VramBytesLimit.Delete(labels) - metrics.GpuCount.Delete(labels) } return nil } @@ -339,7 +357,9 @@ func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context return types.NamespacedName{Name: gpuName} }) // Release GPU resources - if err := r.Allocator.Dealloc(ctx, workload.Spec.Resources.Requests, gpus); err != nil { + if err := 
r.Allocator.Dealloc(ctx, + tfv1.NameNamespace{Namespace: workload.Namespace, Name: workload.Name}, + workload.Spec.Resources.Requests, gpus); err != nil { log.Error(err, "Failed to release GPU resources, will retry", "gpus", gpus, "pod", pod.Name) return false, err } @@ -367,39 +387,29 @@ func (r *TensorFusionWorkloadReconciler) deletePod(ctx context.Context, pod *cor // scaleUpWorkers handles the scaling up of worker pods func (r *TensorFusionWorkloadReconciler) scaleUpWorkers(ctx context.Context, workerGenerator *worker.WorkerGenerator, workload *tfv1.TensorFusionWorkload, count int, hash string) (ctrl.Result, error) { log := log.FromContext(ctx) - + workloadNameNs := tfv1.NameNamespace{Namespace: workload.Namespace, Name: workload.Name} // Create worker pods for range count { // Schedule GPU for the worker - gpus, err := r.Allocator.Alloc(ctx, workload.Spec.PoolName, workload.Spec.Resources.Requests, workload.Spec.GPUCount, workload.Spec.GPUModel) + gpus, err := r.Allocator.Alloc(ctx, workload.Spec.PoolName, workloadNameNs, workload.Spec.Resources.Requests, workload.Spec.GPUCount, workload.Spec.GPUModel) if err != nil { r.Recorder.Eventf(workload, corev1.EventTypeWarning, "ScheduleGPUFailed", "Failed to schedule GPU: %v", err) return ctrl.Result{RequeueAfter: constants.PendingRequeueDuration}, nil } - pod, err := r.tryStartWorker(ctx, workerGenerator, gpus, workload, hash) + _, err = r.tryStartWorker(ctx, workerGenerator, gpus, workload, hash) if err != nil { // Try to release all allocated GPUs if pod creation fails gpus := lo.Map(gpus, func(gpu *tfv1.GPU, _ int) types.NamespacedName { return client.ObjectKeyFromObject(gpu) }) - releaseErr := r.Allocator.Dealloc(ctx, workload.Spec.Resources.Requests, gpus) + releaseErr := r.Allocator.Dealloc(ctx, workloadNameNs, workload.Spec.Resources.Requests, gpus) if releaseErr != nil { log.Error(releaseErr, "Failed to release GPU after pod creation failure", "gpus", gpus) } return ctrl.Result{}, fmt.Errorf("create worker pod: %w", err) } - labels := prometheus.Labels{ - "worker": pod.Name, - "namespace": pod.Namespace, - "pool": workload.Spec.PoolName, - } - metrics.GpuTflopsRequest.With(labels).Set(workload.Spec.Resources.Requests.Tflops.AsApproximateFloat64()) - metrics.GpuTflopsLimit.With(labels).Set(workload.Spec.Resources.Limits.Tflops.AsApproximateFloat64()) - metrics.VramBytesRequest.With(labels).Set(workload.Spec.Resources.Requests.Vram.AsApproximateFloat64()) - metrics.VramBytesLimit.With(labels).Set(workload.Spec.Resources.Limits.Vram.AsApproximateFloat64()) - metrics.GpuCount.With(labels).Set(float64(workload.Spec.GPUCount)) } return ctrl.Result{}, nil diff --git a/internal/controller/tensorfusionworkload_controller_test.go b/internal/controller/tensorfusionworkload_controller_test.go index ecb7ea74..be27da46 100644 --- a/internal/controller/tensorfusionworkload_controller_test.go +++ b/internal/controller/tensorfusionworkload_controller_test.go @@ -17,6 +17,7 @@ limitations under the License. 
package controller import ( + "bytes" "strings" "time" @@ -99,7 +100,7 @@ var _ = Describe("TensorFusionWorkload Controller", func() { gpuNames := strings.Split(podList.Items[0].Annotations[constants.GpuKey], ",") g.Expect(gpuNames).Should(HaveLen(2)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) checkWorkloadStatus(workload) }) @@ -136,7 +137,14 @@ var _ = Describe("TensorFusionWorkload Controller", func() { client.InNamespace(key.Namespace), client.MatchingLabels{constants.WorkloadKey: key.Name})).Should(Succeed()) g.Expect(podList.Items).Should(HaveLen(2)) - }, timeout, interval).Should(Succeed()) + + // Check if metrics is recorded correctly + byteWriter := bytes.NewBuffer([]byte{}) + metricsRecorder.RecordMetrics(byteWriter) + str := byteWriter.String() + g.Expect(str).Should(MatchRegexp("raw_cost=\\d+")) + + }).Should(Succeed()) // Store the original pod template hash var originalPodNames []string @@ -171,7 +179,7 @@ var _ = Describe("TensorFusionWorkload Controller", func() { for _, originalName := range originalPodNames { g.Expect(newPodNames).NotTo(ContainElement(originalName)) } - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) checkWorkloadStatus(workload) }) @@ -212,7 +220,7 @@ var _ = Describe("TensorFusionWorkload Controller", func() { return gpu.Status.Available.Tflops.Equal(resource.MustParse("1990")) && gpu.Status.Available.Vram.Equal(resource.MustParse("1992Gi")) }) return ok - }, timeout, interval).Should(BeTrue()) + }).Should(BeTrue()) Expect(k8sClient.Get(ctx, key, workload)).Should(Succeed()) workloadCopy := workload.DeepCopy() @@ -224,14 +232,14 @@ var _ = Describe("TensorFusionWorkload Controller", func() { client.InNamespace(key.Namespace), client.MatchingLabels{constants.WorkloadKey: key.Name})).To(Succeed()) g.Expect(podList.Items).Should(BeEmpty()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) Eventually(func(g Gomega) { gpu := &tfv1.GPU{} g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(&updatedGPU), gpu)).NotTo(HaveOccurred()) g.Expect(gpu.Status.Available.Tflops.Equal(resource.MustParse("2000"))).Should(BeTrue()) g.Expect(gpu.Status.Available.Vram.Equal(resource.MustParse("2000Gi"))).Should(BeTrue()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) }) }) @@ -248,7 +256,7 @@ var _ = Describe("TensorFusionWorkload Controller", func() { workload.Spec.GPUModel = "mock" // Update the workload g.Expect(k8sClient.Update(ctx, workload)).To(Succeed()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) checkWorkerPodCount(workload) checkWorkloadStatus(workload) @@ -265,7 +273,7 @@ var _ = Describe("TensorFusionWorkload Controller", func() { return pod.DeletionTimestamp == nil }) g.Expect(podList.Items).Should(HaveLen(1)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) // Now check if the pod has the correct GPU Eventually(func(g Gomega) { @@ -282,7 +290,7 @@ var _ = Describe("TensorFusionWorkload Controller", func() { }) g.Expect(ok).To(BeTrue()) g.Expect(gpu.Status.GPUModel).To(Equal("mock")) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) }) }) @@ -301,7 +309,7 @@ var _ = Describe("TensorFusionWorkload Controller", func() { client.InNamespace(key.Namespace), client.MatchingLabels{constants.WorkloadKey: key.Name})).To(Succeed()) g.Expect(podList.Items).To(HaveLen(2)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) // delete workload Expect(k8sClient.Delete(ctx, workload)).To(Succeed()) @@ -313,14 +321,14 @@ var _ = 
Describe("TensorFusionWorkload Controller", func() { client.InNamespace(key.Namespace), client.MatchingLabels{constants.WorkloadKey: key.Name})).To(Succeed()) g.Expect(podList.Items).Should(BeEmpty()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) // wait for workload itself to be deleted Eventually(func(g Gomega) { w := &tfv1.TensorFusionWorkload{} err := k8sClient.Get(ctx, key, w) g.Expect(err).To(HaveOccurred()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) }) }) @@ -346,7 +354,7 @@ func checkWorkerPodCount(workload *tfv1.TensorFusionWorkload) { client.InNamespace(workload.Namespace), client.MatchingLabels{constants.WorkloadKey: workload.Name})).Should(Succeed()) g.Expect(podList.Items).Should(HaveLen(int(*workload.Spec.Replicas))) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func checkWorkloadStatus(in *tfv1.TensorFusionWorkload) { @@ -379,7 +387,7 @@ func checkWorkloadStatus(in *tfv1.TensorFusionWorkload) { g.Expect(readyCondition.Message).Should(ContainSubstring("Failed workers:")) } } - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func createTensorFusionWorkload(poolName string, key client.ObjectKey, replicas int) *tfv1.TensorFusionWorkload { @@ -410,6 +418,7 @@ func createTensorFusionWorkload(poolName string, key client.ObjectKey, replicas Vram: vramLimits, }, }, + Qos: constants.QoSLevelMedium, }, } @@ -417,7 +426,7 @@ func createTensorFusionWorkload(poolName string, key client.ObjectKey, replicas Eventually(func(g Gomega) { g.Expect(k8sClient.Get(ctx, key, workload)).Should(Succeed()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) return workload } @@ -438,7 +447,7 @@ func cleanupWorkload(key client.ObjectKey) { g.Expect(k8sClient.Get(ctx, key, workload)).Should(Succeed()) workload.Spec.Replicas = ptr.Int32(0) g.Expect(k8sClient.Update(ctx, workload)).To(Succeed()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) Eventually(func(g Gomega) { podList := &corev1.PodList{} @@ -446,12 +455,12 @@ func cleanupWorkload(key client.ObjectKey) { client.InNamespace(key.Namespace), client.MatchingLabels{constants.WorkloadKey: key.Name})).To(Succeed()) g.Expect(podList.Items).Should(BeEmpty()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) Expect(k8sClient.Get(ctx, key, workload)).Should(Succeed()) Expect(k8sClient.Delete(ctx, workload)).To(Succeed()) Eventually(func(g Gomega) { err := k8sClient.Get(ctx, key, workload) g.Expect(err).Should(HaveOccurred()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } diff --git a/internal/controller/workloadprofile_controller.go b/internal/controller/workloadprofile_controller.go index 9e352204..aa385a23 100644 --- a/internal/controller/workloadprofile_controller.go +++ b/internal/controller/workloadprofile_controller.go @@ -37,20 +37,9 @@ type WorkloadProfileReconciler struct { // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=workloadprofiles/status,verbs=get;update;patch // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=workloadprofiles/finalizers,verbs=update -// Reconcile is part of the main kubernetes reconciliation loop which aims to -// move the current state of the cluster closer to the desired state. -// TODO(user): Modify the Reconcile function to compare the state specified by -// the WorkloadProfile object against the actual cluster state, and then -// perform operations to make the cluster state reflect the state specified by -// the user. 
-// -// For more details, check Reconcile and its Result here: -// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/reconcile +// WorkloadProfile is a template referred to by TensorFusionWorkload; it requires no reconcile logic func (r *WorkloadProfileReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { _ = log.FromContext(ctx) - - // TODO(user): your logic here - return ctrl.Result{}, nil } diff --git a/internal/gpuallocator/gpuallocator.go b/internal/gpuallocator/gpuallocator.go index c9ddbd79..99be3248 100644 --- a/internal/gpuallocator/gpuallocator.go +++ b/internal/gpuallocator/gpuallocator.go @@ -4,14 +4,20 @@ package gpuallocator import ( "context" "fmt" + "strings" "sync" "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/NexusGPU/tensor-fusion/internal/gpuallocator/filter" + "github.com/samber/lo" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/cache" + "k8s.io/client-go/util/retry" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -51,6 +57,7 @@ type GpuAllocator struct { func (s *GpuAllocator) Alloc( ctx context.Context, poolName string, + workloadNameNamespace tfv1.NameNamespace, request tfv1.Resource, count uint, gpuModel string, @@ -101,7 +108,9 @@ func (s *GpuAllocator) Alloc( s.storeMutex.Lock() defer s.storeMutex.Unlock() + appAdded := false for _, selectedGPU := range selectedGPUs { + // Get the GPU from the store key := types.NamespacedName{Name: selectedGPU.Name, Namespace: selectedGPU.Namespace} gpu, exists := s.gpuStore[key] @@ -115,6 +124,11 @@ func (s *GpuAllocator) Alloc( gpu.Status.Available.Tflops.Sub(request.Tflops) gpu.Status.Available.Vram.Sub(request.Vram) + if !appAdded { + addRunningApp(ctx, gpu, workloadNameNamespace) + appAdded = true + } + s.markGPUDirty(key) } @@ -128,12 +142,13 @@ func (s *GpuAllocator) Alloc( return result, nil } -// Dealloc deallocates a request from one or multiple gpus. -func (s *GpuAllocator) Dealloc(ctx context.Context, request tfv1.Resource, gpus []types.NamespacedName) error { +// Dealloc releases a request from one or more GPUs, adding the resources back to their available capacity.
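// A minimal caller-side sketch of the new Alloc/Dealloc pairing (illustrative only: the
// pool name, workload identity, and request values below are made up, and error handling
// around Alloc is elided; deallocation should use the same workload identity and request
// as the allocation so the running-app reference counts stay balanced):
//
//	wl := tfv1.NameNamespace{Namespace: "default", Name: "example-workload"}
//	req := tfv1.Resource{Tflops: resource.MustParse("20"), Vram: resource.MustParse("8Gi")}
//	gpus, _ := allocator.Alloc(ctx, "example-pool", wl, req, 1, "")
//	keys := lo.Map(gpus, func(g *tfv1.GPU, _ int) types.NamespacedName {
//		return client.ObjectKeyFromObject(g)
//	})
//	// Later, when the worker pod goes away, release the same request for the same workload.
//	_ = allocator.Dealloc(ctx, wl, req, keys)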
+func (s *GpuAllocator) Dealloc(ctx context.Context, workloadNameNamespace tfv1.NameNamespace, request tfv1.Resource, gpus []types.NamespacedName) error { log := log.FromContext(ctx) s.storeMutex.Lock() defer s.storeMutex.Unlock() + appRemoved := false for _, gpu := range gpus { // Get the GPU from the store storeGPU, exists := s.gpuStore[gpu] @@ -145,6 +160,10 @@ func (s *GpuAllocator) Dealloc(ctx context.Context, request tfv1.Resource, gpus // Add resources back to the GPU storeGPU.Status.Available.Tflops.Add(request.Tflops) storeGPU.Status.Available.Vram.Add(request.Vram) + if !appRemoved { + removeRunningApp(ctx, storeGPU, workloadNameNamespace) + appRemoved = true + } s.markGPUDirty(gpu) } @@ -221,6 +240,10 @@ func (s *GpuAllocator) initGPUStore(ctx context.Context) error { } log.Info("GPU store initialized", "count", len(s.gpuStore)) + + // reconcile allocation state based on existing workers + s.reconcileAllocationState(ctx) + log.Info("GPU store data reconciled") return nil } @@ -378,6 +401,8 @@ func (s *GpuAllocator) syncToK8s(ctx context.Context) { s.storeMutex.RLock() defer s.storeMutex.RUnlock() + dirtyNodes := make(map[string]struct{}) + for _, key := range dirtyGPUs { gpu, exists := s.gpuStore[key] if !exists { @@ -386,6 +411,8 @@ func (s *GpuAllocator) syncToK8s(ctx context.Context) { // Create a copy to avoid modifying the memory store directly gpuCopy := gpu.DeepCopy() + dirtyNodes[gpuCopy.Labels[constants.LabelKeyOwner]] = struct{}{} + // Update the GPU status in Kubernetes if err := s.Status().Update(ctx, gpuCopy); err != nil { // If update fails, put the GPU back in the dirty queue @@ -395,6 +422,25 @@ func (s *GpuAllocator) syncToK8s(ctx context.Context) { log.Error(err, "Failed to update GPU status, will retry later", "gpu", key.String()) } } + + for nodeName := range dirtyNodes { + // Refer https://datatracker.ietf.org/doc/html/rfc6901#section-3 encode `/` as `~1` + patch := []byte(`[{ + "op": "add", + "path": "/metadata/annotations/` + strings.ReplaceAll(constants.GPULastReportTimeAnnotationKey, "/", "~1") + `", + "value": "` + time.Now().Format(time.RFC3339) + `" + }]`) + err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + return s.Patch(ctx, &tfv1.GPUNode{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + }, + }, client.RawPatch(types.JSONPatchType, patch)) + }) + if err != nil { + log.Error(err, "Failed to update GPU node last report time, will retry later", "node", nodeName) + } + } } // listGPUsFromPool gets GPUs from the specified pool using the in-memory store @@ -417,3 +463,109 @@ func (s *GpuAllocator) markGPUDirty(key types.NamespacedName) { defer s.dirtyQueueLock.Unlock() s.dirtyQueue[key] = struct{}{} } + +// When it's leader, should reconcile state based on existing workers +// this function is run inside storeMutex lock +func (s *GpuAllocator) reconcileAllocationState(ctx context.Context) { + logger := log.FromContext(ctx) + workers := &v1.PodList{} + if err := s.List(ctx, workers, client.MatchingLabels(map[string]string{ + constants.LabelComponent: constants.ComponentWorker, + })); err != nil { + logger.Error(err, "Failed to list Workloads to reconcile allocation state") + return + } + + tflopsCapacityMap := make(map[types.NamespacedName]resource.Quantity) + vramCapacityMap := make(map[types.NamespacedName]resource.Quantity) + gpuMap := make(map[types.NamespacedName]*tfv1.GPU) + + for gpuKey, gpu := range s.gpuStore { + if gpu.Status.Capacity != nil { + tflopsCapacityMap[gpuKey] = gpu.Status.Capacity.Tflops + vramCapacityMap[gpuKey] 
= gpu.Status.Capacity.Vram + gpu.Status.RunningApps = []*tfv1.RunningAppDetail{} + gpuMap[gpuKey] = gpu + } + } + + for _, worker := range workers.Items { + tflopsRequest, _ := resource.ParseQuantity(worker.Annotations[constants.TFLOPSRequestAnnotation]) + vramRequest, _ := resource.ParseQuantity(worker.Annotations[constants.VRAMRequestAnnotation]) + gpuIds := worker.Annotations[constants.GpuKey] + gpuIdsList := strings.Split(gpuIds, ",") + appAdded := false + for _, gpuId := range gpuIdsList { + gpuKey := types.NamespacedName{Name: gpuId} + gpuCapacity, ok := tflopsCapacityMap[gpuKey] + if ok { + gpuCapacity.Sub(tflopsRequest) + } + gpuCapacity, ok = vramCapacityMap[gpuKey] + if ok { + gpuCapacity.Sub(vramRequest) + } + if !appAdded { + addRunningApp(ctx, gpuMap[gpuKey], tfv1.NameNamespace{Namespace: worker.Namespace, Name: worker.Labels[constants.WorkloadKey]}) + appAdded = true + } + } + } + + for gpuKey, gpu := range s.gpuStore { + if gpu.Status.Capacity == nil { + log.FromContext(ctx).Info("[Warning] GPU capacity is nil, skip reconcile", "gpu", gpuKey.Name) + continue + } + sameTflops := gpu.Status.Available.Tflops.Equal(tflopsCapacityMap[gpuKey]) + sameVRAM := gpu.Status.Available.Vram.Equal(vramCapacityMap[gpuKey]) + if !sameTflops || !sameVRAM { + gpu.Status.Available.Tflops = tflopsCapacityMap[gpuKey] + gpu.Status.Available.Vram = vramCapacityMap[gpuKey] + s.markGPUDirty(gpuKey) + log.FromContext(ctx).Info("Correcting gpu available resources", "gpu", gpuKey.Name, "tflops", gpu.Status.Available.Tflops.String(), "vram", gpu.Status.Available.Vram.String()) + } + } +} + +func addRunningApp(ctx context.Context, gpu *tfv1.GPU, workloadNameNamespace tfv1.NameNamespace) { + if gpu == nil { + log.FromContext(ctx).Info("[Warning] GPU is nil, skip adding running app", "workload", workloadNameNamespace.Name, "namespace", workloadNameNamespace.Namespace) + return + } + if gpu.Status.RunningApps == nil { + gpu.Status.RunningApps = []*tfv1.RunningAppDetail{} + } + + item, found := lo.Find(gpu.Status.RunningApps, func(app *tfv1.RunningAppDetail) bool { + return app.Name == workloadNameNamespace.Name && app.Namespace == workloadNameNamespace.Namespace + }) + + if found { + item.Count++ + } else { + gpu.Status.RunningApps = append(gpu.Status.RunningApps, &tfv1.RunningAppDetail{ + Name: workloadNameNamespace.Name, + Namespace: workloadNameNamespace.Namespace, + Count: 1, + }) + } +} + +func removeRunningApp(ctx context.Context, gpu *tfv1.GPU, workloadNameNamespace tfv1.NameNamespace) { + item, found := lo.Find(gpu.Status.RunningApps, func(app *tfv1.RunningAppDetail) bool { + return app.Name == workloadNameNamespace.Name && app.Namespace == workloadNameNamespace.Namespace + }) + if found { + item.Count-- + if item.Count == 0 { + // scale down to zero, not running any more + gpu.Status.RunningApps = lo.Filter(gpu.Status.RunningApps, func(app *tfv1.RunningAppDetail, _ int) bool { + return app.Name != workloadNameNamespace.Name && app.Namespace != workloadNameNamespace.Namespace + }) + } + } else { + // should not happen, if deallocation twice, it should be a bug + log.FromContext(ctx).Info("[Warning] The app to remove not found, could be caused by deallocation twice bug", "gpu", gpu.Name, "namespace", gpu.Namespace, "workload", workloadNameNamespace.Name, "namespace", workloadNameNamespace.Namespace) + } +} diff --git a/internal/gpuallocator/gpuallocator_suite_test.go b/internal/gpuallocator/gpuallocator_suite_test.go index 6bf4f605..eb3a9418 100644 --- 
a/internal/gpuallocator/gpuallocator_suite_test.go +++ b/internal/gpuallocator/gpuallocator_suite_test.go @@ -265,6 +265,91 @@ var _ = BeforeSuite(func() { Expect(err).NotTo(HaveOccurred()) } + nodes := []tfv1.GPUNode{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node-1", + Labels: map[string]string{ + constants.LabelKeyOwner: "test-pool", + }, + }, + Spec: tfv1.GPUNodeSpec{ + ManageMode: tfv1.GPUNodeManageModeAutoSelect, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node-2", + Labels: map[string]string{ + constants.LabelKeyOwner: "test-pool", + }, + }, + Spec: tfv1.GPUNodeSpec{ + ManageMode: tfv1.GPUNodeManageModeAutoSelect, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node-3", + Labels: map[string]string{ + constants.LabelKeyOwner: "test-pool", + }, + }, + Spec: tfv1.GPUNodeSpec{ + ManageMode: tfv1.GPUNodeManageModeAutoSelect, + }, + }, + } + for _, node := range nodes { + err = k8sClient.Create(ctx, &node) + Expect(err).NotTo(HaveOccurred()) + } + + gpuNodeStatuses := []struct { + name string + status tfv1.GPUNodeStatus + }{ + { + name: "node-1", + status: tfv1.GPUNodeStatus{ + Phase: tfv1.TensorFusionGPUNodePhaseRunning, + TotalTFlops: resource.MustParse("200"), + TotalVRAM: resource.MustParse("48Gi"), + AvailableTFlops: resource.MustParse("180"), + AvailableVRAM: resource.MustParse("48Gi"), + }, + }, + { + name: "node-2", + status: tfv1.GPUNodeStatus{ + Phase: tfv1.TensorFusionGPUNodePhaseRunning, + TotalTFlops: resource.MustParse("120"), + TotalVRAM: resource.MustParse("24Gi"), + AvailableTFlops: resource.MustParse("120"), + AvailableVRAM: resource.MustParse("24Gi"), + }, + }, + { + name: "node-3", + status: tfv1.GPUNodeStatus{ + Phase: tfv1.TensorFusionGPUNodePhaseRunning, + TotalTFlops: resource.MustParse("150"), + TotalVRAM: resource.MustParse("48Gi"), + AvailableTFlops: resource.MustParse("150"), + AvailableVRAM: resource.MustParse("48Gi"), + }, + }, + } + + for _, gpuNodeStatus := range gpuNodeStatuses { + gpuNode := &tfv1.GPUNode{} + err = k8sClient.Get(ctx, types.NamespacedName{Name: gpuNodeStatus.name, Namespace: "default"}, gpuNode) + Expect(err).NotTo(HaveOccurred()) + gpuNode.Status = gpuNodeStatus.status + err = k8sClient.Status().Update(ctx, gpuNode) + Expect(err).NotTo(HaveOccurred()) + } + go func() { defer GinkgoRecover() err = mgr.Start(ctx) @@ -280,10 +365,18 @@ var _ = AfterSuite(func() { }) // Helper function to get a GPU from the API server -func getGPU(name string, namespace string) *tfv1.GPU { +func getGPU(name string) *tfv1.GPU { gpu := &tfv1.GPU{} - key := types.NamespacedName{Name: name, Namespace: namespace} + key := types.NamespacedName{Name: name} err := k8sClient.Get(ctx, key, gpu) ExpectWithOffset(1, err).NotTo(HaveOccurred()) return gpu } + +func getGPUNode(gpu *tfv1.GPU) *tfv1.GPUNode { + gpuNode := &tfv1.GPUNode{} + key := types.NamespacedName{Name: gpu.Labels[constants.LabelKeyOwner]} + err := k8sClient.Get(ctx, key, gpuNode) + ExpectWithOffset(1, err).NotTo(HaveOccurred()) + return gpuNode +} diff --git a/internal/gpuallocator/gpuallocator_test.go b/internal/gpuallocator/gpuallocator_test.go index 9e150762..32388c4e 100644 --- a/internal/gpuallocator/gpuallocator_test.go +++ b/internal/gpuallocator/gpuallocator_test.go @@ -30,11 +30,27 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) +var workloadNameNs = tfv1.NameNamespace{Namespace: "default", Name: "test-workload"} + var _ = Describe("GPU Allocator", func() { var allocator *GpuAllocator + allocateAndSync := func(poolName string, request tfv1.Resource, count uint, 
gpuModel string) ([]*tfv1.GPU, error) { + gpus, err := allocator.Alloc(ctx, poolName, workloadNameNs, request, count, gpuModel) + allocator.syncToK8s(ctx) + return gpus, err + } + + deallocateAndSync := func(gpus []*tfv1.GPU, request tfv1.Resource) { + err := allocator.Dealloc(ctx, workloadNameNs, request, lo.Map(gpus, func(gpu *tfv1.GPU, _ int) types.NamespacedName { + return client.ObjectKeyFromObject(gpu) + })) + Expect(err).NotTo(HaveOccurred()) + allocator.syncToK8s(ctx) + } + BeforeEach(func() { - allocator = NewGpuAllocator(ctx, k8sClient, 3*time.Second) + allocator = NewGpuAllocator(ctx, k8sClient, 150*time.Millisecond) readyCh, err := allocator.SetupWithManager(ctx, mgr) Expect(err).NotTo(HaveOccurred()) @@ -61,17 +77,30 @@ var _ = Describe("GPU Allocator", func() { Vram: resource.MustParse("8Gi"), } - gpus, err := allocator.Alloc(ctx, "test-pool", request, 1, "") + gpus, err := allocateAndSync("test-pool", request, 1, "") Expect(err).NotTo(HaveOccurred()) Expect(gpus).To(HaveLen(1)) - // Explicitly call syncToK8s to persist changes before verification - allocator.syncToK8s(ctx) + gpuNode := &tfv1.GPUNode{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: gpus[0].Labels[constants.LabelKeyOwner]}, gpuNode); err != nil { + Expect(err).NotTo(HaveOccurred()) + } + pool := &tfv1.GPUPool{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: "test-pool"}, pool); err != nil { + Expect(err).NotTo(HaveOccurred()) + } + _, _ = RefreshGPUNodeCapacity(ctx, k8sClient, gpuNode, pool) // Verify resources were reduced on the allocated GPU - gpu := getGPU(gpus[0].Name, gpus[0].Namespace) + gpu := getGPU(gpus[0].Name) Expect(gpu.Status.Available.Tflops.Cmp(gpu.Status.Capacity.Tflops)).To(Equal(-1)) Expect(gpu.Status.Available.Vram.Cmp(gpu.Status.Capacity.Vram)).To(Equal(-1)) + + node := getGPUNode(gpu) + diffTflops := node.Status.TotalTFlops.Value() - node.Status.AvailableTFlops.Value() + diffVRAM := node.Status.TotalVRAM.Value() - node.Status.AvailableVRAM.Value() + Expect(diffTflops).To(BeEquivalentTo(50)) + Expect(diffVRAM).To(BeEquivalentTo(8 * 1024 * 1024 * 1024)) }) It("should allocate multiple GPUs from the same node", func() { @@ -80,7 +109,7 @@ var _ = Describe("GPU Allocator", func() { Vram: resource.MustParse("4Gi"), } - gpus, err := allocator.Alloc(ctx, "test-pool", request, 2, "") + gpus, err := allocateAndSync("test-pool", request, 2, "") Expect(err).NotTo(HaveOccurred()) Expect(gpus).To(HaveLen(2)) @@ -97,7 +126,7 @@ var _ = Describe("GPU Allocator", func() { Vram: resource.MustParse("2Gi"), } - _, err := allocator.Alloc(ctx, "test-pool", request, 10, "") + _, err := allocateAndSync("test-pool", request, 10, "") Expect(err).To(HaveOccurred()) }) @@ -107,7 +136,7 @@ var _ = Describe("GPU Allocator", func() { Vram: resource.MustParse("64Gi"), } - _, err := allocator.Alloc(ctx, "test-pool", request, 1, "") + _, err := allocateAndSync("test-pool", request, 1, "") Expect(err).To(HaveOccurred()) }) @@ -117,7 +146,7 @@ var _ = Describe("GPU Allocator", func() { Vram: resource.MustParse("2Gi"), } - _, err := allocator.Alloc(ctx, "nonexistent-pool", request, 1, "") + _, err := allocateAndSync("nonexistent-pool", request, 1, "") Expect(err).To(HaveOccurred()) }) @@ -128,13 +157,12 @@ var _ = Describe("GPU Allocator", func() { } // Try allocating with a specific GPU model - gpus, err := allocator.Alloc(ctx, "test-pool", request, 1, "NVIDIA A100") + gpus, err := allocateAndSync("test-pool", request, 1, "NVIDIA A100") Expect(err).NotTo(HaveOccurred()) - Expect(gpus).To(HaveLen(1)) 
Expect(gpus[0].Status.GPUModel).To(Equal("NVIDIA A100")) // Try allocating with a non-existent GPU model - _, err = allocator.Alloc(ctx, "test-pool", request, 1, "NonExistentModel") + _, err = allocateAndSync("test-pool", request, 1, "NonExistentModel") Expect(err).To(HaveOccurred()) }) }) @@ -147,7 +175,7 @@ var _ = Describe("GPU Allocator", func() { Vram: resource.MustParse("6Gi"), } - gpus, err := allocator.Alloc(ctx, "test-pool", request, 1, "") + gpus, err := allocateAndSync("test-pool", request, 1, "") Expect(err).NotTo(HaveOccurred()) Expect(gpus).To(HaveLen(1)) @@ -157,11 +185,10 @@ var _ = Describe("GPU Allocator", func() { allocatedVram := allocatedGPU.Status.Available.Vram.DeepCopy() // Now deallocate - err = allocator.Dealloc(ctx, request, []types.NamespacedName{client.ObjectKeyFromObject(gpus[0])}) - Expect(err).NotTo(HaveOccurred()) + deallocateAndSync(gpus, request) // Verify resources were restored - deallocatedGPU := getGPU(allocatedGPU.Name, allocatedGPU.Namespace) + deallocatedGPU := getGPU(allocatedGPU.Name) expectedTflops := allocatedTflops.DeepCopy() expectedVram := allocatedVram.DeepCopy() expectedTflops.Add(request.Tflops) @@ -180,7 +207,7 @@ var _ = Describe("GPU Allocator", func() { } // Allocate 2 GPUs - allocatedGPUs, err := allocator.Alloc(ctx, "test-pool", request, 2, "") + allocatedGPUs, err := allocateAndSync("test-pool", request, 2, "") Expect(err).NotTo(HaveOccurred()) Expect(allocatedGPUs).To(HaveLen(2)) @@ -209,23 +236,14 @@ var _ = Describe("GPU Allocator", func() { vram: gpu.Status.Available.Vram.DeepCopy(), } } - gpusToDeallocKeys := lo.Map(gpusToDealloc, func(gpu *tfv1.GPU, _ int) types.NamespacedName { - return client.ObjectKeyFromObject(gpu) - }) + // Now deallocate all GPUs including the non-existent one - err = allocator.Dealloc(ctx, request, gpusToDeallocKeys) - Expect(err).NotTo(HaveOccurred()) + deallocateAndSync(gpusToDealloc, request) // Verify resources were restored for existing GPUs for _, allocatedGPU := range allocatedGPUs { - deallocatedGPU := getGPU(allocatedGPU.Name, allocatedGPU.Namespace) + deallocatedGPU := getGPU(allocatedGPU.Name) initialState := initialStates[allocatedGPU.Name] - - expectedTflops := initialState.tflops.DeepCopy() - expectedVram := initialState.vram.DeepCopy() - expectedTflops.Add(request.Tflops) - expectedVram.Add(request.Vram) - Expect(deallocatedGPU.Status.Available.Tflops.Cmp(initialState.tflops)).To(Equal(1)) Expect(deallocatedGPU.Status.Available.Vram.Cmp(initialState.vram)).To(Equal(1)) } @@ -280,7 +298,7 @@ var _ = Describe("GPU Allocator", func() { Expect(exists).To(BeTrue()) // Get the GPU from the API server - gpuToDelete := getGPU("gpu-1", "") + gpuToDelete := getGPU("gpu-1") // Handle the deletion event allocator.handleGPUDelete(ctx, gpuToDelete) diff --git a/internal/gpuallocator/node_capacity.go b/internal/gpuallocator/node_capacity.go new file mode 100644 index 00000000..302d1d04 --- /dev/null +++ b/internal/gpuallocator/node_capacity.go @@ -0,0 +1,85 @@ +package gpuallocator + +import ( + "context" + "fmt" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" + "k8s.io/apimachinery/pkg/api/equality" + "k8s.io/apimachinery/pkg/api/resource" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +func RefreshGPUNodeCapacity(ctx context.Context, k8sClient client.Client, node *tfv1.GPUNode, pool *tfv1.GPUPool) ([]string, error) { + gpuList := &tfv1.GPUList{} + if err := k8sClient.List(ctx, gpuList, client.MatchingLabels{constants.LabelKeyOwner: 
node.Name}); err != nil { + return nil, fmt.Errorf("failed to list GPUs: %w", err) + } + if len(gpuList.Items) == 0 { + // node discovery job not completed, wait next reconcile loop to check again + return nil, nil + } + + statusCopy := node.Status.DeepCopy() + + node.Status.AvailableVRAM = resource.Quantity{} + node.Status.AvailableTFlops = resource.Quantity{} + node.Status.TotalTFlops = resource.Quantity{} + node.Status.TotalVRAM = resource.Quantity{} + node.Status.AllocationInfo = []*tfv1.RunningAppDetail{} + + gpuModels := []string{} + deduplicationMap := make(map[string]struct{}) + + for _, gpu := range gpuList.Items { + node.Status.AvailableVRAM.Add(gpu.Status.Available.Vram) + node.Status.AvailableTFlops.Add(gpu.Status.Available.Tflops) + node.Status.TotalVRAM.Add(gpu.Status.Capacity.Vram) + node.Status.TotalTFlops.Add(gpu.Status.Capacity.Tflops) + gpuModels = append(gpuModels, gpu.Status.GPUModel) + + for _, runningApp := range gpu.Status.RunningApps { + if _, ok := deduplicationMap[runningApp.Name+"_"+runningApp.Namespace]; !ok { + node.Status.AllocationInfo = append(node.Status.AllocationInfo, runningApp.DeepCopy()) + deduplicationMap[runningApp.Name+"_"+runningApp.Namespace] = struct{}{} + } + } + } + + virtualVRAM, virtualTFlops := calculateVirtualCapacity(node, pool) + node.Status.VirtualTFlops = virtualTFlops + node.Status.VirtualVRAM = virtualVRAM + + node.Status.Phase = tfv1.TensorFusionGPUNodePhaseRunning + + if !equality.Semantic.DeepEqual(node.Status, statusCopy) { + err := k8sClient.Status().Update(ctx, node) + if err != nil { + return nil, fmt.Errorf("failed to update GPU node status: %w", err) + } + } + return gpuModels, nil +} + +func calculateVirtualCapacity(node *tfv1.GPUNode, pool *tfv1.GPUPool) (resource.Quantity, resource.Quantity) { + diskSize, _ := node.Status.NodeInfo.DataDiskSize.AsInt64() + ramSize, _ := node.Status.NodeInfo.RAMSize.AsInt64() + + virtualVRAM := node.Status.TotalVRAM.DeepCopy() + if pool.Spec.CapacityConfig == nil || pool.Spec.CapacityConfig.Oversubscription == nil { + return virtualVRAM, node.Status.TotalTFlops.DeepCopy() + } + vTFlops := node.Status.TotalTFlops.AsApproximateFloat64() * (float64(pool.Spec.CapacityConfig.Oversubscription.TFlopsOversellRatio) / 100.0) + + virtualVRAM.Add(*resource.NewQuantity( + int64(float64(float64(diskSize)*float64(pool.Spec.CapacityConfig.Oversubscription.VRAMExpandToHostDisk)/100.0)), + resource.DecimalSI), + ) + virtualVRAM.Add(*resource.NewQuantity( + int64(float64(float64(ramSize)*float64(pool.Spec.CapacityConfig.Oversubscription.VRAMExpandToHostMem)/100.0)), + resource.DecimalSI), + ) + + return virtualVRAM, *resource.NewQuantity(int64(vTFlops), resource.DecimalSI) +} diff --git a/internal/metrics/recorder.go b/internal/metrics/recorder.go new file mode 100644 index 00000000..4f1629dc --- /dev/null +++ b/internal/metrics/recorder.go @@ -0,0 +1,286 @@ +package metrics + +import ( + "io" + "sync" + "time" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" + metricsProto "github.com/influxdata/line-protocol/v2/lineprotocol" + "gopkg.in/natefinch/lumberjack.v2" + corev1 "k8s.io/api/core/v1" + ctrl "sigs.k8s.io/controller-runtime" +) + +// Worker level metrics, include worker resources/costs status +// map updated in one reconcile loop in single goroutine, thus no RW lock needed +var workerMetricsLock sync.RWMutex +var workerMetricsMap = map[string]*WorkerMetrics{} + +// Node level metrics, include node allocation/costs status +var nodeMetricsLock 
sync.RWMutex +var NodeMetricsMap = map[string]*NodeMetrics{} + +var log = ctrl.Log.WithName("metrics-recorder") + +type MetricsRecorder struct { + MetricsOutputPath string + + // Raw billing result for node and workers + HourlyUnitPriceMap map[string]float64 + + // Worker level unit price map, key is pool name, second level key is QoS level + WorkerUnitPriceMap map[string]map[string]RawBillingPricing +} + +func RemoveWorkerMetrics(workerName string, deletionTime time.Time) { + workerMetricsLock.Lock() + // to get more accurate metrics, should record the deletion timestamp to calculate duration for the last metrics + workerMetricsMap[workerName].DeletionTimestamp = deletionTime + workerMetricsLock.Unlock() +} + +func RemoveNodeMetrics(nodeName string) { + nodeMetricsLock.Lock() + // Node lifecycle is much longer than worker, so just delete the metrics, 1 minute metrics interval is enough + delete(NodeMetricsMap, nodeName) + nodeMetricsLock.Unlock() +} + +func SetWorkerMetricsByWorkload(pod *corev1.Pod, workload *tfv1.TensorFusionWorkload, now time.Time) { + workerMetricsLock.Lock() + defer workerMetricsLock.Unlock() + + // Initialize metrics + if _, ok := workerMetricsMap[pod.Name]; !ok { + workerMetricsMap[pod.Name] = &WorkerMetrics{ + WorkerName: pod.Name, + WorkloadName: workload.Name, + PoolName: workload.Spec.PoolName, + Namespace: pod.Namespace, + QoS: string(workload.Spec.Qos), + RawCost: 0, + LastRecordTime: now, + } + } + + // Update metrics fields that are mutable + metricsItem := workerMetricsMap[pod.Name] + metricsItem.TflopsRequest = workload.Spec.Resources.Requests.Tflops.AsApproximateFloat64() + metricsItem.TflopsLimit = workload.Spec.Resources.Limits.Tflops.AsApproximateFloat64() + metricsItem.VramBytesRequest = workload.Spec.Resources.Requests.Vram.AsApproximateFloat64() + metricsItem.VramBytesLimit = workload.Spec.Resources.Limits.Vram.AsApproximateFloat64() + if workload.Spec.GPUCount <= 0 { + // handle invalid data if exists + metricsItem.GPUCount = 1 + } else { + metricsItem.GPUCount = int(workload.Spec.GPUCount) + } + metricsItem.WorkloadName = workload.Name + +} + +func SetNodeMetrics(node *tfv1.GPUNode, poolObj *tfv1.GPUPool, gpuModels []string) { + nodeMetricsLock.Lock() + defer nodeMetricsLock.Unlock() + + if _, ok := NodeMetricsMap[node.Name]; !ok { + NodeMetricsMap[node.Name] = &NodeMetrics{ + NodeName: node.Name, + RawCost: 0, + LastRecordTime: time.Now(), + } + } + // Fields that possibly change after initialization + metricsItem := NodeMetricsMap[node.Name] + metricsItem.PoolName = poolObj.Name + metricsItem.GPUModels = gpuModels + + totalTflops := node.Status.TotalTFlops.AsApproximateFloat64() + totalVram := node.Status.TotalVRAM.AsApproximateFloat64() + + metricsItem.AllocatedTflops = totalTflops - node.Status.AvailableTFlops.AsApproximateFloat64() + if totalTflops <= 0 { + metricsItem.AllocatedTflopsPercent = 0 + } else { + metricsItem.AllocatedTflopsPercent = metricsItem.AllocatedTflops / totalTflops * 100 + } + + metricsItem.AllocatedVramBytes = totalVram - node.Status.AvailableVRAM.AsApproximateFloat64() + if totalVram <= 0 { + metricsItem.AllocatedVramPercent = 0 + } else { + metricsItem.AllocatedVramPercent = metricsItem.AllocatedVramBytes / totalVram * 100 + } + + totalVirtualTflops := node.Status.VirtualTFlops.AsApproximateFloat64() + totalVirtualVram := node.Status.VirtualVRAM.AsApproximateFloat64() + if totalVirtualTflops <= 0 { + metricsItem.AllocatedTflopsPercentToVirtualCap = 0 + } else { + metricsItem.AllocatedTflopsPercentToVirtualCap = 
metricsItem.AllocatedTflops / totalVirtualTflops * 100 + } + if totalVirtualVram <= 0 { + metricsItem.AllocatedVramPercentToVirtualCap = 0 + } else { + metricsItem.AllocatedVramPercentToVirtualCap = metricsItem.AllocatedVramBytes / totalVirtualVram * 100 + } +} + +// Start metrics recorder +// The leader container will fill the metrics map, so followers don't have metrics point +// thus metrics recorder only printed in one controller instance +// One minute interval could cause some metrics ignored or billing not accurate, known issue +func (mr *MetricsRecorder) Start() { + + ticker := time.NewTicker(time.Minute) + + writer := &lumberjack.Logger{ + Filename: mr.MetricsOutputPath, + MaxSize: 100, + MaxBackups: 10, + MaxAge: 28, + } + + // Record metrics + go func() { + for { + <-ticker.C + mr.RecordMetrics(writer) + } + }() + + // Clean up worker metrics that have been deleted + go func() { + for { + time.Sleep(5 * time.Minute) + workerMetricsLock.Lock() + for _, metrics := range workerMetricsMap { + if !metrics.DeletionTimestamp.IsZero() { + delete(workerMetricsMap, metrics.WorkerName) + } + } + workerMetricsLock.Unlock() + } + }() +} + +func (mr *MetricsRecorder) RecordMetrics(writer io.Writer) { + if len(workerMetricsMap) <= 0 && len(NodeMetricsMap) <= 0 { + return + } + + now := time.Now() + + var enc metricsProto.Encoder + enc.SetPrecision(metricsProto.Millisecond) + + workerMetricsLock.RLock() + + activeWorkerCnt := 0 + for _, metrics := range workerMetricsMap { + + if !metrics.DeletionTimestamp.IsZero() { + metrics.RawCost = mr.getWorkerRawCost(metrics, metrics.DeletionTimestamp.Sub(metrics.LastRecordTime)) + } else { + metrics.RawCost = mr.getWorkerRawCost(metrics, now.Sub(metrics.LastRecordTime)) + } + metrics.LastRecordTime = now + + // Skip recording metrics if raw cost is negative + // which means worker already deleted waiting for cleanup + if metrics.RawCost < 0 { + continue + } + activeWorkerCnt++ + enc.StartLine("tf_worker_metrics") + enc.AddTag("namespace", metrics.Namespace) + enc.AddTag("pool_name", metrics.PoolName) + enc.AddTag("qos", metrics.QoS) + enc.AddTag("worker_name", metrics.WorkerName) + enc.AddTag("workload_name", metrics.WorkloadName) + + enc.AddField("gpu_count", metricsProto.MustNewValue(int64(metrics.GPUCount))) + enc.AddField("tflops_limit", metricsProto.MustNewValue(metrics.TflopsLimit)) + enc.AddField("tflops_request", metricsProto.MustNewValue(metrics.TflopsRequest)) + enc.AddField("raw_cost", metricsProto.MustNewValue(metrics.RawCost)) + enc.AddField("vram_bytes_limit", metricsProto.MustNewValue(metrics.VramBytesLimit)) + enc.AddField("vram_bytes_request", metricsProto.MustNewValue(metrics.VramBytesRequest)) + + enc.EndLine(now) + } + enc.StartLine("tf_system_metrics") + enc.AddField("total_workers_cnt", metricsProto.MustNewValue(int64(activeWorkerCnt))) + workerMetricsLock.RUnlock() + + nodeMetricsLock.RLock() + for _, metrics := range NodeMetricsMap { + metrics.RawCost = mr.getNodeRawCost(metrics, now.Sub(metrics.LastRecordTime), mr.HourlyUnitPriceMap) + metrics.LastRecordTime = now + + enc.StartLine("tf_node_metrics") + + enc.AddTag("node_name", metrics.NodeName) + enc.AddTag("pool_name", metrics.PoolName) + + enc.AddField("allocated_tflops", metricsProto.MustNewValue(metrics.AllocatedTflops)) + enc.AddField("allocated_tflops_percent", metricsProto.MustNewValue(metrics.AllocatedTflopsPercent)) + enc.AddField("allocated_vram_bytes", metricsProto.MustNewValue(metrics.AllocatedVramBytes)) + enc.AddField("allocated_vram_percent", 
metricsProto.MustNewValue(metrics.AllocatedVramPercent)) + enc.AddField("gpu_count", metricsProto.MustNewValue(int64(len(metrics.GPUModels)))) + enc.AddField("raw_cost", metricsProto.MustNewValue(metrics.RawCost)) + enc.EndLine(now) + } + enc.StartLine("tf_system_metrics") + enc.AddField("total_nodes_cnt", metricsProto.MustNewValue(int64(len(NodeMetricsMap)))) + enc.EndLine(now) + + nodeMetricsLock.RUnlock() + + if err := enc.Err(); err != nil { + log.Error(err, "metrics encoding error", "workerCount", activeWorkerCnt, "nodeCount", len(NodeMetricsMap)) + } + + if _, err := writer.Write(enc.Bytes()); err != nil { + log.Error(err, "metrics writing error", "workerCount", activeWorkerCnt, "nodeCount", len(NodeMetricsMap)) + } + log.Info("metrics and raw billing recorded:", "workerCount", activeWorkerCnt, "nodeCount", len(NodeMetricsMap)) +} + +func (mr *MetricsRecorder) getWorkerRawCost(metrics *WorkerMetrics, duration time.Duration) float64 { + qosPricing, ok := mr.WorkerUnitPriceMap[metrics.PoolName] + // The qos pricing for this pool not set + if !ok { + return 0 + } + // The price of current qos not defined for this pool + qosLevel := metrics.QoS + if qosLevel == "" { + qosLevel = constants.QoSLevelMedium + } + pricing, ok := qosPricing[qosLevel] + if !ok { + return 0 + } + if duration < 0 { + return -1 + } + + rawCostTflopsLimitOverRequest := (metrics.TflopsLimit - metrics.TflopsRequest) * pricing.TflopsOverRequestPerSecond + rawCostPerTflops := pricing.TflopsPerSecond * metrics.TflopsRequest + + rawCostVRAMLimitOverRequest := (metrics.VramBytesLimit - metrics.VramBytesRequest) * pricing.VramOverRequestPerSecond / constants.GiBToBytes + rawCostPerVRAM := pricing.VramPerSecond * metrics.VramBytesRequest / constants.GiBToBytes + + return (rawCostPerTflops + rawCostPerVRAM + rawCostTflopsLimitOverRequest + rawCostVRAMLimitOverRequest) * duration.Seconds() * float64(metrics.GPUCount) +} + +// unit price data comes from global config map, and multi-GPU instance should normalized with per GPU pricing, e.g. 
8xA100 p4d.24xlarge price should divide by 8 +func (mr *MetricsRecorder) getNodeRawCost(metrics *NodeMetrics, duration time.Duration, hourlyUnitPriceMap map[string]float64) float64 { + cost := 0.0 + for _, gpuModel := range metrics.GPUModels { + cost += metrics.AllocatedTflops * duration.Hours() * hourlyUnitPriceMap[gpuModel] + } + return cost +} diff --git a/internal/metrics/type.go b/internal/metrics/type.go new file mode 100644 index 00000000..fdde703a --- /dev/null +++ b/internal/metrics/type.go @@ -0,0 +1,53 @@ +package metrics + +import "time" + +// Metrics will be stored in a map, key is the worker name, value is the metrics +// By default, metrics will be updated every minute +type WorkerMetrics struct { + WorkerName string `json:"workerName"` + WorkloadName string `json:"workloadName"` + PoolName string `json:"poolName"` + Namespace string `json:"namespace"` + QoS string `json:"qos"` + + TflopsRequest float64 `json:"tflopsRequest"` + TflopsLimit float64 `json:"tflopsLimit"` + VramBytesRequest float64 `json:"vramBytesRequest"` + VramBytesLimit float64 `json:"vramBytesLimit"` + GPUCount int `json:"gpuCount"` + RawCost float64 `json:"rawCost"` + + LastRecordTime time.Time `json:"lastRecordTime"` + + // For more accurate metrics, should record the deletion timestamp to calculate duration for the last metrics + DeletionTimestamp time.Time `json:"deletionTimestamp"` +} + +type NodeMetrics struct { + NodeName string `json:"nodeName"` + PoolName string `json:"poolName"` + + AllocatedTflops float64 `json:"allocatedTflops"` + AllocatedTflopsPercent float64 `json:"allocatedTflopsPercent"` + AllocatedVramBytes float64 `json:"allocatedVramBytes"` + AllocatedVramPercent float64 `json:"allocatedVramPercent"` + + AllocatedTflopsPercentToVirtualCap float64 `json:"allocatedTflopsPercentToVirtualCap"` + AllocatedVramPercentToVirtualCap float64 `json:"allocatedVramPercentToVirtualCap"` + + RawCost float64 `json:"rawCost"` + + LastRecordTime time.Time `json:"lastRecordTime"` + + // additional field for raw cost calculation since each GPU has different price + GPUModels []string `json:"gpuModels"` +} + +type RawBillingPricing struct { + TflopsPerSecond float64 + VramPerSecond float64 + + TflopsOverRequestPerSecond float64 + VramOverRequestPerSecond float64 +} diff --git a/internal/metrics/worker.go b/internal/metrics/worker.go deleted file mode 100644 index 3e5bf843..00000000 --- a/internal/metrics/worker.go +++ /dev/null @@ -1,78 +0,0 @@ -package metrics - -import ( - "github.com/prometheus/client_golang/prometheus" - "sigs.k8s.io/controller-runtime/pkg/metrics" -) - -var ( - labels = []string{ - "namespace", "worker", "pool", - } - - nodeLabels = []string{ - "nodeName", "pool", - } - - GpuTflopsRequest = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "gpu_tflops_request", - }, - labels, - ) - - GpuTflopsLimit = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "gpu_tflops_limit", - }, - labels, - ) - - VramBytesRequest = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "vram_bytes_request", - }, - labels, - ) - - VramBytesLimit = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "vram_bytes_limit", - }, - labels, - ) - - GpuCount = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "gpu_count", - Help: "Number of GPUs allocated to the workload", - }, - labels, - ) - - AllocatedTflopsPercent = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "allocated_compute_percentage", - }, - nodeLabels, - ) - - AllocatedVramBytes = prometheus.NewGaugeVec( - 
prometheus.GaugeOpts{ - Name: "allocated_memory_bytes", - }, - nodeLabels, - ) -) - -func init() { - metrics.Registry.MustRegister( - GpuTflopsRequest, - GpuTflopsLimit, - VramBytesRequest, - VramBytesLimit, - AllocatedTflopsPercent, - AllocatedVramBytes, - GpuCount, - ) -} diff --git a/internal/portallocator/portallocator.go b/internal/portallocator/portallocator.go new file mode 100644 index 00000000..72707c22 --- /dev/null +++ b/internal/portallocator/portallocator.go @@ -0,0 +1,268 @@ +package portallocator + +import ( + "context" + "fmt" + "math/bits" + "strconv" + "strings" + "sync" + + "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/utils" + v1 "k8s.io/api/core/v1" + "k8s.io/client-go/util/retry" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" +) + +// Offer API for host port allocation, range from user configured port range +// Use label: `tensor-fusion.ai/host-port: auto` to assigned port at cluster level +// vGPU worker's hostPort will be managed by operator +type PortAllocator struct { + PortRangeStartNode int + PortRangeEndNode int + + PortRangeStartCluster int + PortRangeEndCluster int + + IsLeader bool + + BitmapPerNode map[string][]uint64 + BitmapCluster []uint64 + + Client client.Client + + storeMutexNode sync.RWMutex + storeMutexCluster sync.RWMutex +} + +func NewPortAllocator(ctx context.Context, client client.Client, nodeLevelPortRange string, clusterLevelPortRange string) (*PortAllocator, error) { + if client == nil { + return nil, fmt.Errorf("client cannot be nil") + } + + nodeLevelRange := strings.Split(nodeLevelPortRange, "-") + clusterLevelRange := strings.Split(clusterLevelPortRange, "-") + + portRangeStartNode, _ := strconv.Atoi(nodeLevelRange[0]) + portRangeEndNode, _ := strconv.Atoi(nodeLevelRange[1]) + + portRangeStartCluster, _ := strconv.Atoi(clusterLevelRange[0]) + portRangeEndCluster, _ := strconv.Atoi(clusterLevelRange[1]) + + allocator := &PortAllocator{ + PortRangeStartNode: portRangeStartNode, + PortRangeEndNode: portRangeEndNode, + PortRangeStartCluster: portRangeStartCluster, + PortRangeEndCluster: portRangeEndCluster, + Client: client, + IsLeader: false, + BitmapPerNode: make(map[string][]uint64), + BitmapCluster: make([]uint64, (portRangeEndCluster-portRangeStartCluster)/64+1), + + storeMutexNode: sync.RWMutex{}, + storeMutexCluster: sync.RWMutex{}, + } + + return allocator, nil +} + +func (s *PortAllocator) SetupWithManager(ctx context.Context, mgr manager.Manager) <-chan struct{} { + readyCh := make(chan struct{}, 1) + _ = mgr.Add(manager.RunnableFunc(func(ctx context.Context) error { + <-mgr.Elected() + s.IsLeader = true + leaderInfo := &v1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: constants.LeaderInfoConfigMapName, + Namespace: utils.CurrentNamespace(), + }, + } + err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + _, err := controllerutil.CreateOrUpdate(ctx, s.Client, leaderInfo, func() error { + leaderInfo.Data = map[string]string{ + constants.LeaderInfoConfigMapLeaderIPKey: utils.CurrentIP(), + } + return nil + }) + return err + }) + if err != nil { + log.FromContext(ctx).Error(err, "Failed to update leader IP info in ConfigMap") + } + + s.storeMutexNode.Lock() + s.storeMutexCluster.Lock() + defer s.storeMutexNode.Unlock() + defer s.storeMutexCluster.Unlock() + + // 
1. init bit map from existing pods labeled with tensor-fusion.ai/host-port=auto + s.initBitMapForClusterLevelPortAssign(ctx) + + // 2. init bit map for existing vGPU workers + s.initBitMapForNodeLevelPortAssign(ctx) + + readyCh <- struct{}{} + return nil + })) + return readyCh +} + +func (s *PortAllocator) GetLeaderIP() string { + leaderInfo := &v1.ConfigMap{} + err := s.Client.Get(context.Background(), client.ObjectKey{ + Name: constants.LeaderInfoConfigMapName, + Namespace: utils.CurrentNamespace(), + }, leaderInfo) + if err != nil { + log.FromContext(context.Background()).Error(err, "Failed to get leader IP info from ConfigMap") + return "" + } + if leaderInfo.Data == nil { + return "" + } + return leaderInfo.Data[constants.LeaderInfoConfigMapLeaderIPKey] +} + +// AssignHostPort always called by operator itself, thus no Leader-Follower inconsistency issue +func (s *PortAllocator) AssignHostPort(nodeName string) (int, error) { + if nodeName == "" { + return 0, fmt.Errorf("node name cannot be empty when assign host port") + } + s.storeMutexNode.Lock() + defer s.storeMutexNode.Unlock() + + bitmap, ok := s.BitmapPerNode[nodeName] + if !ok { + // found new nodes not have any ports assigned before + bitmapSize := (s.PortRangeEndNode - s.PortRangeStartNode + 63) / 64 + s.BitmapPerNode[nodeName] = make([]uint64, bitmapSize) + bitmap = s.BitmapPerNode[nodeName] + } + for i, subMap := range bitmap { + bitPos := bits.TrailingZeros64(^subMap) + portOffset := i*64 + bitPos + if subMap != 0xFFFFFFFFFFFFFFFF { + assignedPort := portOffset + s.PortRangeStartNode + if assignedPort < s.PortRangeEndNode { + bitmap[i] = subMap | (1 << bitPos) + return assignedPort, nil + } else { + break + } + } + } + return 0, fmt.Errorf("no available port on node %s", nodeName) + +} + +func (s *PortAllocator) ReleaseHostPort(nodeName string, port int) error { + if port == 0 { + return fmt.Errorf("port cannot be 0 when release host port, may caused by portNumber annotation not detected, nodeName: %s", nodeName) + } + s.storeMutexNode.Lock() + defer s.storeMutexNode.Unlock() + + if bitmap, ok := s.BitmapPerNode[nodeName]; !ok { + return fmt.Errorf("node %s not found in bitmap", nodeName) + } else { + portOffset := port - s.PortRangeStartNode + bitmap[portOffset/64] &^= 1 << (portOffset % 64) + } + return nil +} + +func (s *PortAllocator) AssignClusterLevelHostPort(podName string) (int, error) { + + s.storeMutexCluster.Lock() + defer s.storeMutexCluster.Unlock() + + for i, subMap := range s.BitmapCluster { + bitPos := bits.TrailingZeros64(^subMap) + portOffset := i*64 + bitPos + if subMap != 0xFFFFFFFFFFFFFFFF { + assignedPort := portOffset + s.PortRangeStartCluster + if assignedPort < s.PortRangeEndCluster { + s.BitmapCluster[i] |= 1 << bitPos + return assignedPort, nil + } + } + } + return 0, fmt.Errorf("no available port on cluster") +} + +func (s *PortAllocator) ReleaseClusterLevelHostPort(podName string, port int) error { + if port == 0 { + return fmt.Errorf("port cannot be 0 when release host port, may caused by portNumber annotation not detected, podName: %s", podName) + } + + // TODO, may need a defer queue for releasing so that to avoid port being assigned again too fast + + s.storeMutexCluster.Lock() + defer s.storeMutexCluster.Unlock() + + portOffset := port - s.PortRangeStartCluster + s.BitmapCluster[portOffset/64] &^= 1 << (portOffset % 64) + return nil +} + +func (s *PortAllocator) initBitMapForClusterLevelPortAssign(ctx context.Context) { + log := log.FromContext(ctx) + podList := &v1.PodList{} + err := 
s.Client.List(ctx, podList, client.MatchingLabels{constants.GenHostPortLabel: constants.GenHostPortLabelValue}) + if err != nil { + log.Error(err, "failed to list pods with port allocation label") + return + } + usedPorts := []uint16{} + for _, pod := range podList.Items { + if pod.Annotations == nil { + continue + } + port, _ := strconv.Atoi(pod.Annotations[constants.GenPortNumberAnnotation]) + if port > s.PortRangeEndCluster || port < s.PortRangeStartCluster { + log.Error(err, "existing Pod's host port out of range", "port", port, "expected-start", s.PortRangeStartCluster, "expected-end", s.PortRangeEndCluster, "pod", pod.Name) + continue + } + bitOffSet := port - s.PortRangeStartCluster + + usedPorts = append(usedPorts, uint16(bitOffSet)) + } + + for _, port := range usedPorts { + s.BitmapCluster[port/64] |= 1 << (port % 64) + } +} + +func (s *PortAllocator) initBitMapForNodeLevelPortAssign(ctx context.Context) { + log := log.FromContext(ctx) + podList := &v1.PodList{} + err := s.Client.List(ctx, podList, client.MatchingLabels{constants.LabelComponent: constants.ComponentWorker}) + if err != nil { + log.Error(err, "failed to list pods with port allocation label") + return + } + + size := (s.PortRangeEndNode-s.PortRangeStartNode)/64 + 1 + for _, pod := range podList.Items { + if pod.Annotations == nil { + continue + } + port, _ := strconv.Atoi(pod.Annotations[constants.GenPortNumberAnnotation]) + if port > s.PortRangeEndNode || port < s.PortRangeStartNode { + log.Error(err, "existing Pod's node level host port out of range", "port", port, "expected-start", s.PortRangeStartNode, "expected-end", s.PortRangeEndNode, "pod", pod.Name, "node", pod.Spec.NodeName) + continue + } + bitOffSet := port - s.PortRangeStartNode + if _, ok := s.BitmapPerNode[pod.Spec.NodeName]; !ok { + s.BitmapPerNode[pod.Spec.NodeName] = make([]uint64, size) + } + s.BitmapPerNode[pod.Spec.NodeName][bitOffSet/64] |= 1 << (bitOffSet % 64) + } + +} diff --git a/internal/portallocator/portallocator_suite_test.go b/internal/portallocator/portallocator_suite_test.go new file mode 100644 index 00000000..ec9b4566 --- /dev/null +++ b/internal/portallocator/portallocator_suite_test.go @@ -0,0 +1,158 @@ +package portallocator + +import ( + "context" + "fmt" + "path/filepath" + "runtime" + "testing" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/envtest" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" +) + +// These tests use Ginkgo (BDD-style Go testing framework). Refer to +// http://onsi.github.io/ginkgo/ to learn more about Ginkgo. 
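// A rough worked example of the bitmap bookkeeping exercised by this suite (the numbers
// assume the node-level range "40000-42000" configured below; the arithmetic mirrors
// AssignHostPort/ReleaseHostPort and is not additional production logic):
//
//	offset := 40127 - 40000            // worker-3 below holds node port 40127 -> offset 127
//	word, bit := offset/64, offset%64  // word 1, bit 63
//	bitmap[word] |= 1 << bit           // mark the port as used at init/assign time
//	bitmap[word] &^= 1 << bit          // clear it again on release
//
// Free ports are located per word with bits.TrailingZeros64(^word), i.e. the lowest zero bit.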
+ +var ( + cancel context.CancelFunc + cfg *rest.Config + ctx context.Context + k8sClient client.Client + testEnv *envtest.Environment + mgr ctrl.Manager + pa *PortAllocator +) + +func TestPortAllocator(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Port Allocator Suite") +} +func genHostPortPod(name string, nodeName string, port int32, clusterLevel bool) corev1.Pod { + var labels map[string]string + if clusterLevel { + labels = map[string]string{ + constants.GenHostPortLabel: constants.GenHostPortLabelValue, + constants.GenHostPortNameLabel: "test", + constants.LabelKeyOwner: nodeName, + } + } else { + labels = map[string]string{ + constants.LabelComponent: constants.ComponentWorker, + constants.LabelKeyOwner: nodeName, + } + } + return corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: "default", + Labels: labels, + Annotations: map[string]string{ + constants.GenPortNumberAnnotation: fmt.Sprintf("%d", port), + }, + }, + Spec: corev1.PodSpec{ + NodeName: nodeName, + Containers: []corev1.Container{ + { + Name: "test", + Image: "test-image", + Ports: []corev1.ContainerPort{ + { + Name: "test", + ContainerPort: 80, + HostPort: port, + }, + }, + }, + }, + }, + } +} + +var _ = BeforeSuite(func() { + logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) + + ctx, cancel = context.WithCancel(context.TODO()) + + By("bootstrapping test environment") + testEnv = &envtest.Environment{ + CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, + ErrorIfCRDPathMissing: false, + BinaryAssetsDirectory: filepath.Join("..", "..", "bin", "k8s", + fmt.Sprintf("1.31.0-%s-%s", runtime.GOOS, runtime.GOARCH)), + } + + var err error + // cfg is defined in this file globally. + cfg, err = testEnv.Start() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg).NotTo(BeNil()) + + err = tfv1.AddToScheme(scheme.Scheme) + Expect(err).NotTo(HaveOccurred()) + + err = corev1.AddToScheme(scheme.Scheme) + Expect(err).NotTo(HaveOccurred()) + + // Create a Kubernetes client + k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) + Expect(err).NotTo(HaveOccurred()) + Expect(k8sClient).NotTo(BeNil()) + + mgr, err = ctrl.NewManager(cfg, ctrl.Options{ + Scheme: scheme.Scheme, + Metrics: metricsserver.Options{ + BindAddress: "0", + }, + }) + Expect(err).NotTo(HaveOccurred()) + + // Create test GPUs with metadata only first + workers := []corev1.Pod{ + genHostPortPod("worker-1", "node-1", 40000, false), + genHostPortPod("worker-2", "node-1", 40001, false), + genHostPortPod("worker-3", "node-1", 40127, false), + genHostPortPod("worker-4", "node-2", 40003, false), + genHostPortPod("worker-5", "node-2", 40065, false), + genHostPortPod("lab-1", "node-1", 42001, true), + genHostPortPod("lab-2", "node-1", 59999, true), + } + + // First create the GPUs without status + for i := range workers { + err = k8sClient.Create(ctx, &workers[i]) + Expect(err).NotTo(HaveOccurred()) + } + + pa, err = NewPortAllocator(ctx, k8sClient, "40000-42000", "42001-60000") + Expect(err).NotTo(HaveOccurred()) + readyCh := pa.SetupWithManager(ctx, mgr) + Expect(err).NotTo(HaveOccurred()) + + go func() { + defer GinkgoRecover() + err = mgr.Start(ctx) + Expect(err).ToNot(HaveOccurred(), "failed to run manager") + }() + <-readyCh +}) + +var _ = AfterSuite(func() { + By("tearing down the test environment") + cancel() + err := testEnv.Stop() + Expect(err).NotTo(HaveOccurred()) +}) diff --git a/internal/portallocator/portallocator_test.go 
b/internal/portallocator/portallocator_test.go new file mode 100644 index 00000000..cccd58f0 --- /dev/null +++ b/internal/portallocator/portallocator_test.go @@ -0,0 +1,166 @@ +package portallocator + +import ( + "fmt" + "strconv" + "sync" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("Port Allocator", func() { + BeforeEach(func() { + // Reset state before each test + // This is important to ensure tests don't interfere with each other + // We're using the existing pa instance from the suite setup + }) + + Context("AssignHostPort", func() { + It("should assign a valid port for a node", func() { + port, err := pa.AssignHostPort("node-1") + Expect(err).NotTo(HaveOccurred()) + Expect(port).To(Equal(40002)) + + port, err = pa.AssignHostPort("node-1") + Expect(err).NotTo(HaveOccurred()) + Expect(port).To(Equal(40003)) + + err = pa.ReleaseHostPort("node-1", 40002) + Expect(err).NotTo(HaveOccurred()) + + port, err = pa.AssignHostPort("node-1") + Expect(err).NotTo(HaveOccurred()) + Expect(port).To(Equal(40002)) + + port, err = pa.AssignHostPort("node-new") + Expect(err).NotTo(HaveOccurred()) + Expect(port).To(Equal(40000)) + }) + + It("should fail when node name is empty", func() { + _, err := pa.AssignHostPort("") + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("node name cannot be empty")) + }) + + It("should exhaust ports and return error when no ports available", func() { + // Create a node with a small port range for testing exhaustion + nodeName := "exhaust-test-node" + + // Assign ports until we get an error + var lastPort int + var err error + assignedPorts := make(map[int]bool) + + // Keep assigning ports until we get an error or hit a reasonable limit + for i := 0; i < 2002; i++ { + lastPort, err = pa.AssignHostPort(nodeName) + if err != nil { + break + } + + // Verify we don't get duplicate ports + Expect(assignedPorts).NotTo(HaveKey(lastPort)) + assignedPorts[lastPort] = true + } + + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("no available port")) + }) + }) + + Context("ReleaseHostPort", func() { + It("should release a port successfully", func() { + nodeName := "release-test-node" + port, err := pa.AssignHostPort(nodeName) + Expect(err).NotTo(HaveOccurred()) + + err = pa.ReleaseHostPort(nodeName, port) + Expect(err).NotTo(HaveOccurred()) + }) + + It("should fail to release ports with invalid parameters", func() { + tests := []struct { + description string + node string + port int + errorMsg string + }{ + {"invalid node name", "invalid-node-name", 40001, "node invalid-node-name not found"}, + {"port is zero", "node-1", 0, "port cannot be 0 when release host port"}, + } + + for _, tc := range tests { + By(tc.description) + err := pa.ReleaseHostPort(tc.node, tc.port) + Expect(err).To(HaveOccurred()) + if tc.errorMsg != "" { + Expect(err.Error()).To(ContainSubstring(tc.errorMsg)) + } + } + }) + }) + + Context("Cluster Level Port Allocation", func() { + It("should assign and release cluster level ports", func() { + podName := "test-cluster-pod" + port, err := pa.AssignClusterLevelHostPort(podName) + Expect(err).NotTo(HaveOccurred()) + Expect(port).To(Equal(42002)) + + err = pa.ReleaseClusterLevelHostPort(podName, port) + Expect(err).NotTo(HaveOccurred()) + + err = pa.ReleaseClusterLevelHostPort(podName, 59999) + Expect(err).NotTo(HaveOccurred()) + + port, err = pa.AssignClusterLevelHostPort(podName) + Expect(err).NotTo(HaveOccurred()) + Expect(port).To(Equal(42002)) + }) + + It("should fail to 
release a cluster port with invalid parameters", func() {
+			err := pa.ReleaseClusterLevelHostPort("test-pod", 0)
+			Expect(err).To(HaveOccurred())
+			Expect(err.Error()).To(ContainSubstring("port cannot be 0 when release host port"))
+		})
+	})
+
+	Context("Concurrency", func() {
+		It("should handle concurrent port assignments and releases", func() {
+			const workers = 20
+			var wg sync.WaitGroup
+			results := make(chan error, workers)
+
+			wg.Add(workers)
+			for i := 0; i < workers; i++ {
+				go func(i int) {
+					defer wg.Done()
+					node := "concurrent-node-" + strconv.Itoa(i%5)
+					_, err := pa.AssignHostPort(node)
+					if err != nil {
+						results <- fmt.Errorf("assignment failed: %v", err)
+						return
+					}
+				}(i)
+			}
+
+			// Wait for all goroutines to complete
+			wg.Wait()
+
+			for i := 0; i < 5; i++ {
+				bitMap := pa.BitmapPerNode["concurrent-node-"+strconv.Itoa(i)]
+				Expect(bitMap).To(HaveLen(32))
+				Expect(bitMap[0]).To(Equal(uint64(0xf)))
+			}
+
+			close(results)
+
+			// Check for any errors
+			for err := range results {
+				Expect(err).NotTo(HaveOccurred())
+			}
+		})
+	})
+})
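Note: the bitmap expectations in the concurrency test follow directly from the node-level range "40000-42000" passed to NewPortAllocator: 2001 ports need ceil(2001/64) = 32 uint64 words, and four assignments on a fresh node set the four lowest bits of word 0 (0xf). The sketch below illustrates that first-free-bit bookkeeping under those assumptions; the names are illustrative and not the actual PortAllocator implementation.

package main

import "fmt"

// firstFreePort scans a per-node bitmap (one bit per port, lowest bit first)
// and returns the lowest unused port at or above rangeStart.
func firstFreePort(bitmap []uint64, rangeStart int) (int, bool) {
	for w, word := range bitmap {
		if word == ^uint64(0) { // every port tracked by this word is taken
			continue
		}
		for b := 0; b < 64; b++ {
			if word&(1<<uint(b)) == 0 {
				return rangeStart + w*64 + b, true
			}
		}
	}
	return 0, false
}

func main() {
	// 40000-42000 inclusive is 2001 ports, so 32 words, matching HaveLen(32).
	bitmap := make([]uint64, 32)
	for i := 0; i < 4; i++ {
		p, _ := firstFreePort(bitmap, 40000)
		bitmap[(p-40000)/64] |= 1 << uint((p-40000)%64)
	}
	fmt.Printf("bitmap[0] = %#x\n", bitmap[0]) // 0xf, as the test expects
}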
diff --git a/internal/server/router/assign_host_port.go b/internal/server/router/assign_host_port.go
new file mode 100644
index 00000000..8633c9c6
--- /dev/null
+++ b/internal/server/router/assign_host_port.go
@@ -0,0 +1,33 @@
+package router
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+
+	"github.com/NexusGPU/tensor-fusion/internal/portallocator"
+	"github.com/gin-gonic/gin"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+)
+
+type AssignHostPortRouter struct {
+	allocator *portallocator.PortAllocator
+}
+
+func NewAssignHostPortRouter(ctx context.Context, allocator *portallocator.PortAllocator) (*AssignHostPortRouter, error) {
+	return &AssignHostPortRouter{allocator: allocator}, nil
+}
+
+func (r *AssignHostPortRouter) AssignHostPort(ctx *gin.Context) {
+	// TODO: verify the service account token; the issuer must match the current instance,
+	// i.e. the request must come from a peer operator Pod
+
+	podName := ctx.Query("podName")
+	port, err := r.allocator.AssignClusterLevelHostPort(podName)
+	if err != nil {
+		ctx.String(http.StatusInternalServerError, err.Error())
+		return
+	}
+	log.FromContext(ctx).Info("assigned host port", "podName", podName, "port", port)
+	ctx.String(http.StatusOK, fmt.Sprintf("%d", port))
+}
diff --git a/internal/server/router/connection.go b/internal/server/router/connection.go
index 5008048d..7b3a7661 100644
--- a/internal/server/router/connection.go
+++ b/internal/server/router/connection.go
@@ -117,11 +117,13 @@ func (cw *connectionWatcher) subscribe(req types.NamespacedName) (connectionChan
 func (cw *connectionWatcher) watchConnections(ctx context.Context, watcher watch.Interface) {
 	// Watch for changes
 	defer watcher.Stop()
+	watcherChan := watcher.ResultChan()
 	for {
+
 		select {
 		case <-ctx.Done():
 			return
-		case event, ok := <-watcher.ResultChan():
+		case event, ok := <-watcherChan:
 			if !ok {
 				return
 			}
diff --git a/internal/server/server.go b/internal/server/server.go
index 040a94c6..816d7c3f 100644
--- a/internal/server/server.go
+++ b/internal/server/server.go
@@ -8,6 +8,7 @@ import (
 
 func NewHTTPServer(
 	cr *router.ConnectionRouter,
+	ahp *router.AssignHostPortRouter,
 ) *gin.Engine {
 	r := gin.New()
 
@@ -17,5 +18,6 @@ func NewHTTPServer(
 	apiGroup := r.Group("/api")
 	apiGroup.GET("/connection", cr.Get)
+	apiGroup.POST("/assign-host-port", ahp.AssignHostPort)
 	return r
 }
diff --git a/internal/utils/net.go b/internal/utils/net.go
new file mode 100644
index 00000000..c8940c04
--- /dev/null
+++ b/internal/utils/net.go
@@ -0,0 +1,38 @@
+package utils
+
+import "net"
+
+// CurrentIP returns the IPv4 address of the first non-loopback interface that is up; it panics if none is found.
+func CurrentIP() string {
+	interfaces, err := net.Interfaces()
+	if err != nil {
+		panic(err)
+	}
+
+	for _, iface := range interfaces {
+		if iface.Flags&net.FlagUp == 0 || iface.Flags&net.FlagLoopback != 0 {
+			continue
+		}
+
+		addrs, err := iface.Addrs()
+		if err != nil {
+			continue
+		}
+
+		for _, addr := range addrs {
+			ipNet, ok := addr.(*net.IPNet)
+			if !ok {
+				continue
+			}
+
+			ip := ipNet.IP
+			if ip.IsLoopback() || ip.To4() == nil {
+				continue
+			}
+
+			return ip.String()
+		}
+	}
+
+	panic("no internal IP address found")
+}
diff --git a/internal/utils/reconcile.go b/internal/utils/reconcile.go
index bcd80239..eb7da3b7 100644
--- a/internal/utils/reconcile.go
+++ b/internal/utils/reconcile.go
@@ -29,6 +29,15 @@ var ErrNextLoop = errors.New("stop this loop and return the associated Result ob
 // ErrTerminateLoop is not a real error. It forces the current reconciliation loop to stop
 var ErrTerminateLoop = errors.New("stop this loop and do not requeue")
+
+// Minimum time between reconciliations for the same object
+var debounceInterval = 3 * time.Second
+
+func init() {
+	if os.Getenv("GO_TESTING") == "true" {
+		debounceInterval = 60 * time.Millisecond
+	}
+}
+
 // HandleFinalizer ensures proper finalizer management for Kubernetes resources.
 // It automatically adds the finalizer when needed, and removes it after successful cleanup.
 // Returns (shouldReturn, err):
@@ -147,10 +156,6 @@ func CompareAndGetObjectHash(hash string, obj ...any) (bool, string) {
 const DebounceKeySuffix = ":in_queue"
 
 func DebouncedReconcileCheck(ctx context.Context, lastProcessedItems *sync.Map, name types.NamespacedName) (runNow bool, alreadyQueued bool, waitTime time.Duration) {
-	const (
-		// Minimum time between reconciliations for the same object
-		debounceInterval = 3 * time.Second
-	)
 	now := time.Now()
 	key := name.String()
 	inQueueKey := key + DebounceKeySuffix
diff --git a/internal/webhook/v1/pod_webhook.go b/internal/webhook/v1/pod_webhook.go
index cd460220..57bb13eb 100644
--- a/internal/webhook/v1/pod_webhook.go
+++ b/internal/webhook/v1/pod_webhook.go
@@ -19,8 +19,12 @@ package v1
 import (
 	"context"
 	"encoding/json"
+	goErrors "errors"
 	"fmt"
+	"io"
 	"net/http"
+	"strconv"
+	"time"
 
 	"gomodules.xyz/jsonpatch/v2"
 	corev1 "k8s.io/api/core/v1"
@@ -36,6 +40,7 @@ import (
 	"al.essio.dev/pkg/shellescape"
 	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
 	"github.com/NexusGPU/tensor-fusion/internal/constants"
+	"github.com/NexusGPU/tensor-fusion/internal/portallocator"
 	"github.com/NexusGPU/tensor-fusion/internal/utils"
 	"github.com/NexusGPU/tensor-fusion/internal/worker"
 	"github.com/lithammer/shortuuid/v4"
@@ -43,22 +48,24 @@ import (
 )
 
 // SetupPodWebhookWithManager registers the webhook for Pod in the manager.
-func SetupPodWebhookWithManager(mgr ctrl.Manager) error {
+func SetupPodWebhookWithManager(mgr ctrl.Manager, portAllocator *portallocator.PortAllocator) error {
 	webhookServer := mgr.GetWebhookServer()
 
 	webhookServer.Register("/mutate-v1-pod", &admission.Webhook{
 		Handler: &TensorFusionPodMutator{
-			decoder: admission.NewDecoder(runtime.NewScheme()),
-			Client:  mgr.GetClient(),
+			decoder:       admission.NewDecoder(runtime.NewScheme()),
+			Client:        mgr.GetClient(),
+			portAllocator: portAllocator,
 		},
 	})
 	return nil
 }
 
 type TensorFusionPodMutator struct {
-	Client  client.Client
-	decoder admission.Decoder
+	Client        client.Client
+	decoder       admission.Decoder
+	portAllocator *portallocator.PortAllocator
 }
 
 // Handle implements admission.Handler interface.
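Note: cmd/main.go is not part of this excerpt, so the composition below is only a hedged sketch of how these pieces presumably fit together, with one PortAllocator shared by the leader-only HTTP endpoint and the pod webhook. The function and type names come from this diff; the helper name, the ConnectionRouter parameter, and the :8080 listen address are assumptions.

package bootstrap

import (
	"context"

	"github.com/NexusGPU/tensor-fusion/internal/portallocator"
	"github.com/NexusGPU/tensor-fusion/internal/server"
	"github.com/NexusGPU/tensor-fusion/internal/server/router"
	webhookv1 "github.com/NexusGPU/tensor-fusion/internal/webhook/v1"
	ctrl "sigs.k8s.io/controller-runtime"
)

// wireHostPortAllocation is a hypothetical helper, not repository code.
func wireHostPortAllocation(ctx context.Context, mgr ctrl.Manager, connRouter *router.ConnectionRouter) error {
	// Build the allocator from the node-level and cluster-level ranges used in the tests.
	portAllocator, err := portallocator.NewPortAllocator(ctx, mgr.GetClient(), "40000-42000", "42001-60000")
	if err != nil {
		return err
	}
	readyCh := portAllocator.SetupWithManager(ctx, mgr)

	ahp, err := router.NewAssignHostPortRouter(ctx, portAllocator)
	if err != nil {
		return err
	}

	// Serve the allocation endpoint once the allocator has restored existing assignments;
	// :8080 matches the address the webhook uses when forwarding to the leader.
	go func() {
		<-readyCh
		_ = server.NewHTTPServer(connRouter, ahp).Run(":8080")
	}()

	// The mutating webhook needs the same allocator to hand out cluster-level host ports.
	return webhookv1.SetupPodWebhookWithManager(mgr, portAllocator)
}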
@@ -97,18 +104,18 @@ func (m *TensorFusionPodMutator) Handle(ctx context.Context, req admission.Reque podCounterAnnotationKey = podCounterKey } + pool := &tfv1.GPUPool{} + if err := m.Client.Get(ctx, client.ObjectKey{Name: tfInfo.Profile.PoolName}, pool); err != nil { + return admission.Errored(http.StatusInternalServerError, fmt.Errorf("gpu pool(%s) does not exist", tfInfo.Profile.PoolName)) + } + workload := &tfv1.TensorFusionWorkload{} if tfInfo.GenWorkload { - if err := m.createOrUpdateWorkload(ctx, pod, &tfInfo, workload); err != nil { + if err := m.createOrUpdateWorkload(ctx, pod, &tfInfo, workload, pool); err != nil { return admission.Errored(http.StatusInternalServerError, fmt.Errorf("create tf workload: %w", err)) } } - pool := &tfv1.GPUPool{} - if err := m.Client.Get(ctx, client.ObjectKey{Name: tfInfo.Profile.PoolName}, pool); err != nil { - return admission.Errored(http.StatusInternalServerError, fmt.Errorf("gpu pool(%s) does not exist", tfInfo.Profile.PoolName)) - } - var nodeSelector map[string]string if tfInfo.Profile.IsLocalGPU { if !tfInfo.GenWorkload { @@ -116,12 +123,26 @@ func (m *TensorFusionPodMutator) Handle(ctx context.Context, req admission.Reque return admission.Errored(http.StatusInternalServerError, fmt.Errorf("workload(%s) does not exist", tfInfo.WorkloadName)) } } - workloadStatus, err := worker.SelectWorker(ctx, m.Client, workload, 1) - if err != nil { - log.Error(err, "failed to select worker for pod", "pod", req.Name, "namespace", req.Namespace) - return admission.Errored(http.StatusInternalServerError, fmt.Errorf("select worker: %w", err)) + + workerFound := false + for i := 0; i < 25; i++ { + workloadStatus, err := worker.SelectWorker(ctx, m.Client, workload, 1) + if err != nil { + if goErrors.Is(err, worker.ErrNoAvailableWorker) { + time.Sleep(time.Second) + continue + } + log.Error(err, "failed to select worker for pod", "pod", req.Name, "namespace", req.Namespace) + return admission.Errored(http.StatusInternalServerError, fmt.Errorf("select worker: %w", err)) + } + nodeSelector = workloadStatus.NodeSelector + workerFound = true + break + } + + if !workerFound { + return admission.Errored(http.StatusInternalServerError, fmt.Errorf("no available worker for pod: %s", req.Name)) } - nodeSelector = workloadStatus.NodeSelector } // Inject initContainer and env variables @@ -153,10 +174,12 @@ func (m *TensorFusionPodMutator) InjectDecoder(d admission.Decoder) error { return nil } -func (m *TensorFusionPodMutator) createOrUpdateWorkload(ctx context.Context, pod *corev1.Pod, tfInfo *TensorFusionInfo, workload *tfv1.TensorFusionWorkload) error { +func (m *TensorFusionPodMutator) createOrUpdateWorkload(ctx context.Context, pod *corev1.Pod, tfInfo *TensorFusionInfo, workload *tfv1.TensorFusionWorkload, pool *tfv1.GPUPool) error { // Check if workload exists err := m.Client.Get(ctx, client.ObjectKey{Name: tfInfo.WorkloadName, Namespace: pod.Namespace}, workload) + qos := calculateQoSLevel(tfInfo.Profile, pool) + if err != nil { if !errors.IsNotFound(err) { return fmt.Errorf("failed to get workload: %w", err) @@ -182,7 +205,7 @@ func (m *TensorFusionPodMutator) createOrUpdateWorkload(ctx context.Context, pod PoolName: tfInfo.Profile.PoolName, Resources: tfInfo.Profile.Resources, GPUCount: tfInfo.Profile.GPUCount, - Qos: tfInfo.Profile.Qos, + Qos: qos, GPUModel: tfInfo.Profile.GPUModel, IsLocalGPU: tfInfo.Profile.IsLocalGPU, }, @@ -210,7 +233,7 @@ func (m *TensorFusionPodMutator) createOrUpdateWorkload(ctx context.Context, pod Replicas: &replicas, PoolName: 
tfInfo.Profile.PoolName,
 			Resources:  tfInfo.Profile.Resources,
-			Qos:        tfInfo.Profile.Qos,
+			Qos:        qos,
 			IsLocalGPU: tfInfo.Profile.IsLocalGPU,
 			GPUCount:   tfInfo.Profile.GPUCount,
 			GPUModel:   tfInfo.Profile.GPUModel,
@@ -266,8 +289,16 @@ func (m *TensorFusionPodMutator) patchTFClient(
 		pod.Labels = map[string]string{}
 	}
 	pod.Labels[constants.LabelKeyPodTemplateHash] = utils.GetObjectHash(clientConfig)
+	pod.Labels[constants.LabelComponent] = constants.ComponentClient
 	pod.Labels[constants.GpuPoolKey] = pool.Name
 
+	// Patch hostPort allocation
+	if pod.Labels[constants.GenHostPortLabel] == constants.GenHostPortLabelValue {
+		if err := m.generateHostPort(pod, pod.Labels[constants.GenHostPortNameLabel]); err != nil {
+			return nil, fmt.Errorf("can not generate host port: %w", err)
+		}
+	}
+
 	containerPatched := false
 	// Patch to Container
 	for _, name := range containerNames {
@@ -374,3 +405,94 @@ func (m *TensorFusionPodMutator) patchTFClient(
 	patches = append(patches, strategicpatches...)
 	return patches, nil
 }
+
+func (m *TensorFusionPodMutator) generateHostPort(pod *corev1.Pod, portName string) error {
+
+	portNameFound := false
+	containerIndex := -1
+	portIndex := -1
+	for i := range pod.Spec.Containers {
+		container := &pod.Spec.Containers[i]
+		for j := range container.Ports {
+			port := &container.Ports[j]
+			if port.Name == portName {
+				portNameFound = true
+				containerIndex = i
+				portIndex = j
+			}
+		}
+	}
+	if !portNameFound {
+		return fmt.Errorf("port name %s not found, can not assign host port for pod %s", portName, pod.Name)
+	}
+
+	if !m.portAllocator.IsLeader {
+		port, err := m.assignClusterHostPortFromLeader(pod)
+		if err != nil {
+			return fmt.Errorf("can not assign cluster host port from leader: %w", err)
+		}
+		pod.Annotations[constants.GenPortNumberAnnotation] = strconv.Itoa(port)
+	} else {
+		port, err := m.portAllocator.AssignClusterLevelHostPort(pod.Name)
+		if err != nil {
+			return fmt.Errorf("can not assign cluster level host port: %w", err)
+		}
+		pod.Annotations[constants.GenPortNumberAnnotation] = strconv.Itoa(port)
+	}
+
+	pod.Spec.Containers[containerIndex].Ports[portIndex].HostPort = int32(m.getPortNumber(pod))
+	return nil
+}
+
+func (m *TensorFusionPodMutator) getPortNumber(pod *corev1.Pod) int {
+	portNumber, _ := strconv.Atoi(pod.Annotations[constants.GenPortNumberAnnotation])
+	return portNumber
+}
+
+func (m *TensorFusionPodMutator) assignClusterHostPortFromLeader(pod *corev1.Pod) (int, error) {
+	httpClient := &http.Client{Timeout: 10 * time.Second}
+	leaderIP := m.portAllocator.GetLeaderIP()
+	if leaderIP == "" {
+		return 0, fmt.Errorf("operator leader IP not found")
+	}
+
+	// The leader registers this handler as POST /api/assign-host-port (see internal/server/server.go)
+	url := fmt.Sprintf("http://%s:8080/api/assign-host-port?podName=%s", leaderIP, pod.Name)
+	resp, err := httpClient.Post(url, "text/plain", nil)
+	if err != nil {
+		return 0, fmt.Errorf("failed to assign host port: %w", err)
+	}
+	defer func() {
+		_ = resp.Body.Close()
+	}()
+
+	if resp.StatusCode != http.StatusOK {
+		return 0, fmt.Errorf("host port allocation failed: %s", resp.Status)
+	}
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return 0, fmt.Errorf("failed to read allocation response: %w", err)
+	}
+
+	return strconv.Atoi(string(body))
+}
+
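Note: for orientation, the sketch below shows roughly the pod shape that takes the generateHostPort path above: the opt-in label pair plus a named container port for the webhook to rewrite. The label keys are referenced through the constants package because their literal values are not visible in this diff; the pod name, image, and port number are made up, and the TensorFusion annotations that route a pod through this mutator in the first place are omitted.

package example

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/NexusGPU/tensor-fusion/internal/constants"
)

// hostPortClientPod sketches a client pod that asks the webhook for a
// cluster-level host port on the container port named "http".
func hostPortClientPod() corev1.Pod {
	return corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "inference-client",
			Namespace: "default",
			Labels: map[string]string{
				constants.GenHostPortLabel:     constants.GenHostPortLabelValue,
				constants.GenHostPortNameLabel: "http", // must match a container port name
			},
			// The webhook writes GenPortNumberAnnotation here after allocation.
			Annotations: map[string]string{},
		},
		Spec: corev1.PodSpec{
			Containers: []corev1.Container{{
				Name:  "app",
				Image: "app:latest",
				Ports: []corev1.ContainerPort{{Name: "http", ContainerPort: 8000}},
			}},
		},
	}
}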
+func calculateQoSLevel(profile *tfv1.WorkloadProfileSpec, pool *tfv1.GPUPool) tfv1.QoSLevel {
+	sameReqLimits := profile.Resources.Limits.Tflops.Value() == profile.Resources.Requests.Tflops.Value() &&
+		profile.Resources.Limits.Vram.Value() == profile.Resources.Requests.Vram.Value()
+
+	// set to critical if requests == limits, same logic as Kubernetes QoS classes
+	if sameReqLimits {
+		return constants.QoSLevelCritical
+	}
+
+	// when QoS is not set, fall back to the pool's default QoS, or medium if the pool has none
+	if profile.Qos == "" {
+		if pool.Spec.QosConfig == nil || pool.Spec.QosConfig.DefaultQoS == "" {
+			return constants.QoSLevelMedium
+		}
+		return pool.Spec.QosConfig.DefaultQoS
+	}
+	return profile.Qos
+}
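Note: the precedence in calculateQoSLevel is easy to misread, so here it is restated as a tiny standalone sketch using plain strings instead of the tfv1 types; the string values are placeholders, not the real constants.

package main

import "fmt"

// qosFor mirrors the decision order of calculateQoSLevel: equal requests and
// limits always win; otherwise an explicit profile QoS is used as-is; an empty
// QoS falls back to the pool default, or to "medium" when the pool has none.
func qosFor(sameReqLimits bool, profileQoS, poolDefault string) string {
	if sameReqLimits {
		return "critical"
	}
	if profileQoS == "" {
		if poolDefault == "" {
			return "medium"
		}
		return poolDefault
	}
	return profileQoS
}

func main() {
	fmt.Println(qosFor(true, "low", "high"))  // critical: equal requests/limits override everything
	fmt.Println(qosFor(false, "", "high"))    // high: pool default fills the gap
	fmt.Println(qosFor(false, "", ""))        // medium: hard-coded fallback
	fmt.Println(qosFor(false, "low", "high")) // low: explicit profile QoS wins over the default
}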
diff --git a/internal/webhook/v1/webhook_suite_test.go b/internal/webhook/v1/webhook_suite_test.go
index 25eb459f..4e5d369b 100644
--- a/internal/webhook/v1/webhook_suite_test.go
+++ b/internal/webhook/v1/webhook_suite_test.go
@@ -28,6 +28,7 @@ import (
 
 	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
 	"github.com/NexusGPU/tensor-fusion/internal/config"
+	"github.com/NexusGPU/tensor-fusion/internal/portallocator"
 
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
@@ -61,7 +62,6 @@ var (
 
 func TestAPIs(t *testing.T) {
 	RegisterFailHandler(Fail)
-
 	RunSpecs(t, "Webhook Suite")
 }
 
@@ -134,7 +134,11 @@ var _ = BeforeSuite(func() {
 	})
 	Expect(err).NotTo(HaveOccurred())
 
-	err = SetupPodWebhookWithManager(mgr)
+	err = SetupPodWebhookWithManager(mgr, &portallocator.PortAllocator{
+		PortRangeStartCluster: 42000,
+		PortRangeEndCluster:   62000,
+		BitmapCluster:         make([]uint64, (62000-42000)/64+1),
+	})
 	Expect(err).NotTo(HaveOccurred())
 
 	// +kubebuilder:scaffold:webhook
diff --git a/internal/worker/worker.go b/internal/worker/worker.go
index 1d447176..6bb56569 100644
--- a/internal/worker/worker.go
+++ b/internal/worker/worker.go
@@ -3,34 +3,37 @@ package worker
 
 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"math"
 	"strconv"
 	"strings"
-	"time"
 
 	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
 	"github.com/NexusGPU/tensor-fusion/internal/config"
 	"github.com/NexusGPU/tensor-fusion/internal/constants"
 	"github.com/NexusGPU/tensor-fusion/internal/utils"
 	"github.com/samber/lo"
-	"golang.org/x/exp/rand"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
-func init() {
-	rand.Seed(uint64(time.Now().UnixNano()))
-}
-
 type WorkerGenerator struct {
 	GpuInfos     *[]config.GpuInfo
 	WorkerConfig *tfv1.WorkerConfig
 }
 
+var ErrNoAvailableWorker = errors.New("no available worker")
+
 func (wg *WorkerGenerator) WorkerPort(pod *corev1.Pod) (int, error) {
-	port, ok := lo.Find(pod.Spec.Containers[0].Env, func(env corev1.EnvVar) bool {
+	portAnnotation, ok := pod.Annotations[constants.GenPortNumberAnnotation]
+	if ok {
+		return strconv.Atoi(portAnnotation)
+	}
+
+	// Fall back to the env var for older worker Pods that carry no port annotation
+	portEnv, ok := lo.Find(pod.Spec.Containers[0].Env, func(env corev1.EnvVar) bool {
 		return env.Name == constants.WorkerPortEnv
 	})
 
@@ -38,13 +41,7 @@ func (wg *WorkerGenerator) WorkerPort(pod *corev1.Pod) (int, error) {
 		return 0, fmt.Errorf("worker port not found in pod %s", pod.Name)
 	}
 
-	return strconv.Atoi(port.Value)
-}
-
-func (wg *WorkerGenerator) AllocPort() int {
-	min := 30000
-	max := 65535
-	return rand.Intn(max-min+1) + min
+	return strconv.Atoi(portEnv.Value)
 }
 
 func (wg *WorkerGenerator) PodTemplateHash(workloadSpec any) (string, error) {
@@ -61,6 +58,7 @@ func (wg *WorkerGenerator) GenerateWorkerPod(
 	generateName string,
 	namespace string,
 	port int,
+	requests tfv1.Resource,
 	limits tfv1.Resource,
 	podTemplateHash string,
 ) (*corev1.Pod, string, error) {
@@ -146,11 +144,32 @@ func (wg *WorkerGenerator) GenerateWorkerPod(
 			},
 		},
 	})
-
+	workerLabels := map[string]string{
+		constants.LabelComponent: constants.ComponentWorker,
+	}
+	if podTmpl.Template.Labels != nil {
+		for k, v := range podTmpl.Template.Labels {
+			workerLabels[k] = v
+		}
+	}
+	workerAnnotations := map[string]string{
+		constants.TFLOPSRequestAnnotation: requests.Tflops.String(),
+		constants.TFLOPSLimitAnnotation:   limits.Tflops.String(),
+		constants.VRAMRequestAnnotation:   requests.Vram.String(),
+		constants.VRAMLimitAnnotation:     limits.Vram.String(),
+		constants.GenPortNumberAnnotation: strconv.Itoa(port),
+	}
+	if podTmpl.Template.Annotations != nil {
+		for k, v := range podTmpl.Template.Annotations {
+			workerAnnotations[k] = v
+		}
+	}
 	return &corev1.Pod{
 		ObjectMeta: metav1.ObjectMeta{
 			GenerateName: generateName,
 			Namespace:    namespace,
+			Labels:       workerLabels,
+			Annotations:  workerAnnotations,
 		},
 		Spec: spec,
 	}, podTemplateHash, nil
@@ -163,7 +182,7 @@ func SelectWorker(
 	maxSkew int32,
 ) (*tfv1.WorkerStatus, error) {
 	if len(workload.Status.WorkerStatuses) == 0 {
-		return nil, fmt.Errorf("no available worker")
+		return nil, ErrNoAvailableWorker
 	}
 	usageMapping := lo.SliceToMap(workload.Status.WorkerStatuses, func(status tfv1.WorkerStatus) (string, int) {