diff --git a/.gitignore b/.gitignore index 97a779a7..d1783912 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ Dockerfile.cross # Output of the go coverage tool, specifically when used with LiteIDE *.out +cover.out.* # Go workspace file go.work diff --git a/.vscode/launch.json b/.vscode/launch.json index e34c9fad..f14afdd2 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -55,7 +55,10 @@ "type": "go", "request": "launch", "mode": "test", - "program": "${workspaceFolder}", + "env": { + "GO_TESTING": "true" + }, + "program": "${workspaceFolder}/internal/controller", "console": "integratedTerminal" } ] diff --git a/.vscode/settings.json b/.vscode/settings.json index ec8decc1..4a4e6403 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,12 +1,15 @@ { "cSpell.words": [ + "alertmanager", "alicloud", "Aliyun", "AMDCDNA", "AMDRDNA", "apimachinery", + "automount", "AWSGPU", "batchv", + "burstable", "CDNA", "certificaterequests", "certmanager", @@ -39,6 +42,7 @@ "greptime", "greptimedb", "healthz", + "iface", "karpenter", "kubebuilder", "KUBECONFIG", @@ -51,6 +55,7 @@ "NVML", "omitempty", "onsi", + "portallocator", "printcolumn", "prometheusagents", "prometheuses", @@ -62,11 +67,13 @@ "schedulingconfigtemplates", "schedulingcorev", "shirou", + "strategicpatches", "subresource", "tensorfusion", "tensorfusionaiv", "tensorfusioncluster", "tensorfusionclusters", + "tensorfusionworkload", "Tera", "tflops", "Tmpl", diff --git a/Makefile b/Makefile index 28c3dd41..73c5441d 100644 --- a/Makefile +++ b/Makefile @@ -62,13 +62,8 @@ vet: ## Run go vet against code. .PHONY: test test: manifests generate fmt vet envtest ## Run tests. - KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -timeout 0 -coverprofile cover.out + KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" GO_TESTING=true go run github.com/onsi/ginkgo/v2/ginkgo -p -timeout 0 -cover -coverprofile cover.out -r --skip-file ./test/e2e -# TODO(user): To use a different vendor for e2e tests, modify the setup under 'tests/e2e'. -# The default setup assumes Kind is pre-installed and builds/loads the Manager Docker image locally. -# Prometheus and CertManager are installed by default; skip with: -# - PROMETHEUS_INSTALL_SKIP=true -# - CERT_MANAGER_INSTALL_SKIP=true .PHONY: test-e2e test-e2e: manifests generate fmt vet ## Run the e2e tests. Expected an isolated environment using Kind. 
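// Note on the GO_TESTING flag introduced above: both the VS Code launch configuration and the
// Makefile test target now export GO_TESTING=true before the suite starts. A minimal sketch of
// the kind of guard this enables follows; the helper name and package are hypothetical, since
// this diff does not show where the variable is consumed.
package utils

import "os"

// IsTestMode reports whether the process was started by the test harness
// (ginkgo via `make test`, or the VS Code debug configuration).
func IsTestMode() bool {
	return os.Getenv("GO_TESTING") == "true"
}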
@command -v kind >/dev/null 2>&1 || { \ diff --git a/api/v1/gpu_types.go b/api/v1/gpu_types.go index f42b1db5..5a073232 100644 --- a/api/v1/gpu_types.go +++ b/api/v1/gpu_types.go @@ -36,6 +36,18 @@ type GPUStatus struct { GPUModel string `json:"gpuModel"` Message string `json:"message"` + + // +optional + RunningApps []*RunningAppDetail `json:"runningApps,omitempty"` +} + +type RunningAppDetail struct { + // Workload name namespace + Name string `json:"name,omitempty"` + Namespace string `json:"namespace,omitempty"` + + // Worker count + Count int `json:"count"` } // +kubebuilder:validation:Enum=Pending;Provisioning;Running;Unknown;Destroying;Migrating diff --git a/api/v1/gpunode_funcs.go b/api/v1/gpunode_funcs.go index 39225880..21f73af9 100644 --- a/api/v1/gpunode_funcs.go +++ b/api/v1/gpunode_funcs.go @@ -12,7 +12,7 @@ func (node *GPUNode) InitializeStatus(initTFlops, initVRAM resource.Quantity, in TotalTFlops: initTFlops, TotalVRAM: initVRAM, TotalGPUs: initGPUs, - AllocationDetails: &[]GPUNodeAllocationDetails{}, + AllocationInfo: []*RunningAppDetail{}, LoadedModels: &[]string{}, ManagedGPUDeviceIDs: []string{}, ObservedGeneration: node.Generation, diff --git a/api/v1/gpunode_types.go b/api/v1/gpunode_types.go index 7423bf4a..a2a7ee08 100644 --- a/api/v1/gpunode_types.go +++ b/api/v1/gpunode_types.go @@ -94,20 +94,8 @@ type GPUNodeStatus struct { ObservedGeneration int64 `json:"observedGeneration,omitempty"` - // Allocation details is for node compaction, and calculate used apps // +optional - AllocationDetails *[]GPUNodeAllocationDetails `json:"allocationDetails,omitempty"` -} - -type GPUNodeAllocationDetails struct { - PodID string `json:"podID,omitempty"` - PodName string `json:"podName,omitempty"` - Namespace string `json:"namespace"` - WorkloadName string `json:"workload,omitempty"` - - Requests GPUResourceUnit `json:"requests"` - Limits GPUResourceUnit `json:"limits"` - QoS QoSLevel `json:"qos,omitempty"` + AllocationInfo []*RunningAppDetail `json:"allocationInfo,omitempty"` } // +kubebuilder:validation:Enum=Pending;Provisioning;Migrating;Running;Succeeded;Failed;Unknown;Destroying diff --git a/api/v1/gpupool_types.go b/api/v1/gpupool_types.go index db8b8989..bc7cd2bd 100644 --- a/api/v1/gpupool_types.go +++ b/api/v1/gpupool_types.go @@ -293,7 +293,7 @@ type QosPricing struct { Requests GPUResourcePricingUnit `json:"requests,omitempty"` - // Default requests and limitsOverRequests are same, indicates normal on-demand serverless GPU usage, in hands-on lab low QoS case, limitsOverRequests should be cheaper, for example Low QoS, ratio should be 0.5 + // Default requests and limitsOverRequests are same, indicates normal on-demand serverless GPU usage, in hands-on lab low QoS case, limitsOverRequests should be lower, so that user can get burstable GPU resources with very low cost // +kubebuilder:default="1" LimitsOverRequestsChargingRatio string `json:"limitsOverRequests,omitempty"` } @@ -372,6 +372,8 @@ type GPUPoolStatus struct { AvailableTFlops resource.Quantity `json:"availableTFlops"` AvailableVRAM resource.Quantity `json:"availableVRAM"` + RunningAppsCnt int32 `json:"runningAppsCnt,omitempty"` + // +optional VirtualAvailableTFlops *resource.Quantity `json:"virtualAvailableTFlops,omitempty"` // +optional diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 6d16045a..c20d35b8 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -448,23 +448,6 @@ func (in *GPUNode) DeepCopyObject() runtime.Object { return nil } -// 
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *GPUNodeAllocationDetails) DeepCopyInto(out *GPUNodeAllocationDetails) { - *out = *in - in.Requests.DeepCopyInto(&out.Requests) - in.Limits.DeepCopyInto(&out.Limits) -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeAllocationDetails. -func (in *GPUNodeAllocationDetails) DeepCopy() *GPUNodeAllocationDetails { - if in == nil { - return nil - } - out := new(GPUNodeAllocationDetails) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *GPUNodeClass) DeepCopyInto(out *GPUNodeClass) { *out = *in @@ -704,14 +687,14 @@ func (in *GPUNodeStatus) DeepCopyInto(out *GPUNodeStatus) { *out = make([]string, len(*in)) copy(*out, *in) } - if in.AllocationDetails != nil { - in, out := &in.AllocationDetails, &out.AllocationDetails - *out = new([]GPUNodeAllocationDetails) - if **in != nil { - in, out := *in, *out - *out = make([]GPUNodeAllocationDetails, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) + if in.AllocationInfo != nil { + in, out := &in.AllocationInfo, &out.AllocationInfo + *out = make([]*RunningAppDetail, len(*in)) + for i := range *in { + if (*in)[i] != nil { + in, out := &(*in)[i], &(*out)[i] + *out = new(RunningAppDetail) + **out = **in } } } @@ -961,6 +944,17 @@ func (in *GPUStatus) DeepCopyInto(out *GPUStatus) { (*out)[key] = val } } + if in.RunningApps != nil { + in, out := &in.RunningApps, &out.RunningApps + *out = make([]*RunningAppDetail, len(*in)) + for i := range *in { + if (*in)[i] != nil { + in, out := &(*in)[i], &(*out)[i] + *out = new(RunningAppDetail) + **out = **in + } + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUStatus. @@ -1597,6 +1591,21 @@ func (in *Resources) DeepCopy() *Resources { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RunningAppDetail) DeepCopyInto(out *RunningAppDetail) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RunningAppDetail. +func (in *RunningAppDetail) DeepCopy() *RunningAppDetail { + if in == nil { + return nil + } + out := new(RunningAppDetail) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ScaleToZero) DeepCopyInto(out *ScaleToZero) { *out = *in diff --git a/charts/tensor-fusion/Chart.yaml b/charts/tensor-fusion/Chart.yaml index 9fe41356..775a3726 100644 --- a/charts/tensor-fusion/Chart.yaml +++ b/charts/tensor-fusion/Chart.yaml @@ -15,10 +15,10 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 1.2.22 +version: 1.3.2 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. 
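// The RunningAppDetail type added in api/v1/gpu_types.go only tracks a workload's name, namespace
// and worker count; GPUStatus.RunningApps and GPUNodeStatus.AllocationInfo are slices of pointers
// to it. A minimal sketch of the bookkeeping an allocator could do when a worker lands on a GPU;
// the helper below is hypothetical and not part of this change (assumes
// tfv1 = "github.com/NexusGPU/tensor-fusion/api/v1", as imported elsewhere in this repo).
func addRunningApp(gpu *tfv1.GPU, workloadName, namespace string) {
	for _, app := range gpu.Status.RunningApps {
		if app.Name == workloadName && app.Namespace == namespace {
			app.Count++ // another worker of an already-known workload
			return
		}
	}
	gpu.Status.RunningApps = append(gpu.Status.RunningApps, &tfv1.RunningAppDetail{
		Name:      workloadName,
		Namespace: namespace,
		Count:     1,
	})
}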
-appVersion: "1.12.1" +appVersion: "1.30.3" diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_gpunodes.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_gpunodes.yaml index 6cd2c886..c798cb3c 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_gpunodes.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_gpunodes.yaml @@ -86,66 +86,19 @@ spec: status: description: GPUNodeStatus defines the observed state of GPUNode. properties: - allocationDetails: - description: Allocation details is for node compaction, and calculate - used apps + allocationInfo: items: properties: - limits: - properties: - tflops: - anyOf: - - type: integer - - type: string - description: Tera floating point operations per second - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - vram: - anyOf: - - type: integer - - type: string - description: VRAM is short for Video memory, namely GPU - RAM - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: object - namespace: - type: string - podID: - type: string - podName: - type: string - qos: - enum: - - low - - medium - - high - - critical + count: + description: Worker count + type: integer + name: + description: Workload name namespace type: string - requests: - properties: - tflops: - anyOf: - - type: integer - - type: string - description: Tera floating point operations per second - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - vram: - anyOf: - - type: integer - - type: string - description: VRAM is short for Video memory, namely GPU - RAM - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: object - workload: + namespace: type: string required: - - limits - - namespace - - requests + - count type: object type: array availableTFlops: diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml index c2257300..781418d0 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml @@ -546,7 +546,8 @@ spec: description: Default requests and limitsOverRequests are same, indicates normal on-demand serverless GPU usage, in hands-on lab low QoS case, limitsOverRequests should - be cheaper, for example Low QoS, ratio should be 0.5 + be lower, so that user can get burstable GPU resources + with very low cost type: string qos: enum: @@ -704,6 +705,9 @@ spec: readyNodes: format: int32 type: integer + runningAppsCnt: + format: int32 + type: integer savedCostsPerMonth: type: string totalGPUs: diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml index ace87bc4..09a14f86 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml @@ -116,6 +116,21 @@ spec: - Destroying - Migrating type: string + runningApps: + items: + properties: + count: + description: Worker count + type: integer + name: + description: Workload name namespace + type: string + namespace: + type: string + required: + - count + type: object + type: array uuid: type: string required: diff --git 
a/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml index b7bc95c5..3cb00209 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml @@ -650,8 +650,8 @@ spec: description: Default requests and limitsOverRequests are same, indicates normal on-demand serverless GPU usage, in hands-on lab low QoS case, limitsOverRequests - should be cheaper, for example Low QoS, ratio - should be 0.5 + should be lower, so that user can get burstable + GPU resources with very low cost type: string qos: enum: diff --git a/charts/tensor-fusion/templates/admission-webhooks/mutating-webhook.yaml b/charts/tensor-fusion/templates/admission-webhooks/mutating-webhook.yaml index 5ea83d82..581e4ce2 100644 --- a/charts/tensor-fusion/templates/admission-webhooks/mutating-webhook.yaml +++ b/charts/tensor-fusion/templates/admission-webhooks/mutating-webhook.yaml @@ -22,6 +22,7 @@ webhooks: resources: - pods sideEffects: None + timeoutSeconds: 30 objectSelector: matchExpressions: - key: tensor-fusion.ai/enabled diff --git a/charts/tensor-fusion/templates/controller-deployment.yaml b/charts/tensor-fusion/templates/controller-deployment.yaml index 92a57194..14301a66 100644 --- a/charts/tensor-fusion/templates/controller-deployment.yaml +++ b/charts/tensor-fusion/templates/controller-deployment.yaml @@ -4,6 +4,7 @@ metadata: name: {{ include "tensor-fusion.fullname" . }}-controller namespace: {{ include "tensor-fusion.namespace" . }} labels: + tensor-fusion.ai/component: operator {{- include "tensor-fusion.labels" . | nindent 4 }} spec: replicas: {{ .Values.controller.replicaCount }} @@ -12,6 +13,7 @@ spec: app.kubernetes.io/name: {{ include "tensor-fusion.name" . }} app.kubernetes.io/instance: {{ .Release.Name }} app.kubernetes.io/component: controller + tensor-fusion.ai/component: operator template: metadata: {{- with .Values.controller.podAnnotations }} @@ -22,6 +24,7 @@ spec: app.kubernetes.io/name: {{ include "tensor-fusion.name" . }} app.kubernetes.io/instance: {{ .Release.Name }} app.kubernetes.io/component: controller + tensor-fusion.ai/component: operator spec: {{- with .Values.imagePullSecrets }} imagePullSecrets: @@ -35,6 +38,7 @@ spec: - /manager - -metrics-bind-address - :9000 + - -leader-elect livenessProbe: {{- toYaml .Values.controller.livenessProbe | nindent 12 }} readinessProbe: @@ -54,6 +58,8 @@ spec: - name: cert readOnly: true mountPath: /tmp/k8s-webhook-server/serving-certs + - name: logs + mountPath: /logs - name: cloud-vendor-credentials mountPath: /tmp/secret readOnly: true @@ -85,12 +91,17 @@ spec: requests: cpu: 50m memory: 64Mi + limits: + cpu: 1000m + memory: 512Mi volumeMounts: - name: logs mountPath: /logs - name: vector-config mountPath: /etc/vector/vector.yaml subPath: vector-operator.yaml + - name: kubernetes-logs + mountPath: /var/log/pods volumes: - name: cert secret: @@ -115,6 +126,9 @@ spec: name: gpu-info - name: logs emptyDir: {} + - name: kubernetes-logs + hostPath: + path: /var/log/pods {{- with .Values.controller.affinity }} affinity: {{- toYaml . 
| nindent 8 }} diff --git a/charts/tensor-fusion/templates/gpu-public-gpu-info.yaml b/charts/tensor-fusion/templates/gpu-public-gpu-info.yaml index f39c3798..23b2cb4a 100644 --- a/charts/tensor-fusion/templates/gpu-public-gpu-info.yaml +++ b/charts/tensor-fusion/templates/gpu-public-gpu-info.yaml @@ -432,6 +432,12 @@ data: costPerHour: 0.26 fp16TFlops: 163 + - model: RTX2000Ada + fullModelName: "NVIDIA RTX 2000 Ada Generation" + vendor: NVIDIA + costPerHour: 0.23 + fp16TFlops: 46 + # NVIDIA GTX Series - model: GTX1050Ti fullModelName: "NVIDIA GeForce GTX 1050 Ti" diff --git a/charts/tensor-fusion/templates/greptime-standalone.yaml b/charts/tensor-fusion/templates/greptime-standalone.yaml index 58810885..5f13c9dc 100644 --- a/charts/tensor-fusion/templates/greptime-standalone.yaml +++ b/charts/tensor-fusion/templates/greptime-standalone.yaml @@ -61,15 +61,18 @@ metadata: namespace: greptimedb labels: app.greptime.io/component: greptimedb-standalone + tensor-fusion.ai/component: greptimedb spec: replicas: 1 selector: matchLabels: app.greptime.io/component: greptimedb-standalone + tensor-fusion.ai/component: greptimedb template: metadata: labels: app.greptime.io/component: greptimedb-standalone + tensor-fusion.ai/component: greptimedb spec: volumes: - name: logs diff --git a/charts/tensor-fusion/templates/rbac-hypervisor.yaml b/charts/tensor-fusion/templates/rbac-hypervisor.yaml new file mode 100644 index 00000000..5ce23a05 --- /dev/null +++ b/charts/tensor-fusion/templates/rbac-hypervisor.yaml @@ -0,0 +1,14 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: tensor-fusion-hypervisor-role +rules: +- apiGroups: + - "" + resources: + - nodes + - pods + verbs: + - get + - list + - watch diff --git a/charts/tensor-fusion/templates/rbac.yaml b/charts/tensor-fusion/templates/rbac.yaml index 523bf956..f2ae925b 100644 --- a/charts/tensor-fusion/templates/rbac.yaml +++ b/charts/tensor-fusion/templates/rbac.yaml @@ -104,6 +104,18 @@ rules: - patch - update - watch +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - list + - watch + - create + - update + - patch + - delete - apiGroups: - tensor-fusion.ai resources: diff --git a/charts/tensor-fusion/templates/serviceaccount-hypervisor.yaml b/charts/tensor-fusion/templates/serviceaccount-hypervisor.yaml new file mode 100644 index 00000000..1c117826 --- /dev/null +++ b/charts/tensor-fusion/templates/serviceaccount-hypervisor.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + # Service account for watch vGPU worker auto scaling event and collect Pod log metadata + # The name is fixed and only needs pods/nodes read permission + name: tensor-fusion-hypervisor-sa + namespace: {{ include "tensor-fusion.namespace" . }} + labels: + {{- include "tensor-fusion.labels" . 
| nindent 4 }} +automountServiceAccountToken: true \ No newline at end of file diff --git a/charts/tensor-fusion/templates/vector-config.yaml b/charts/tensor-fusion/templates/vector-config.yaml index 2479a4a7..406de6eb 100644 --- a/charts/tensor-fusion/templates/vector-config.yaml +++ b/charts/tensor-fusion/templates/vector-config.yaml @@ -8,30 +8,80 @@ data: api: enabled: true sources: - controller_metrics: - type: prometheus_scrape - endpoints: - - http://localhost:9000/metrics + metrics: + type: file + data_dir: /logs + include: + - /logs/metrics*.log + + kubernetes_logs: + type: kubernetes_logs + self_node_name: "${NODE_NAME}" + extra_label_selector: "tensor-fusion.ai/component in (operator)" + transforms: - prepare_controller_metrics: + parse_influx: type: remap - inputs: - - controller_metrics + inputs: + - metrics + source: | + . = parse_influxdb!(.message) + prepare_metrics: + type: remap + inputs: + - parse_influx source: | .namespace = "tf" + .tags.nodeName = "${NODE_NAME}" + + log_to_metric: + type: log_to_metric + inputs: + - prepare_metrics + all_metrics: true + metrics: [] + + prepare_kubernetes_logs: + type: remap + inputs: + - kubernetes_logs + source: | + .message = .message + .container = .kubernetes.container_name + .pod = .kubernetes.pod_name + .namespace = .kubernetes.pod_namespace + .component = .kubernetes.pod_labels."tensor-fusion.ai/component" + del(.kubernetes) + del(.file) + del(.source_type) sinks: - sink_greptimedb_controller_metrics: - type: prometheus_remote_write + sink_greptimedb_operator_metrics: + type: greptimedb_metrics inputs: - - prepare_controller_metrics + - log_to_metric + new_naming: false + endpoint: {{ .Values.greptime.host }}:{{ .Values.greptime.port }} + {{- if eq .Values.greptime.isCloud true }} + dbname: {{ .Values.greptime.db }} + username: {{ .Values.greptime.user }} + password: {{ .Values.greptime.password }} + tls: {} + {{- end }} + + sink_greptimedb_operator_logs: + type: greptimedb_logs + compression: gzip + table: tf_system_log + inputs: + - prepare_kubernetes_logs {{- if ne .Values.greptime.isCloud true }} - endpoint: http://{{ .Values.greptime.host }}:4000/v1/prometheus/write?db=public + endpoint: http://{{ .Values.greptime.host }}:4000 + dbname: public {{- else }} - endpoint: https://{{ .Values.greptime.host }}/v1/prometheus/write?db={{ .Values.greptime.db }} - auth: - strategy: basic - user: {{ .Values.greptime.user }} - password: {{ .Values.greptime.password }} + endpoint: https://{{ .Values.greptime.host }} + dbname: {{ .Values.greptime.db }} + username: {{ .Values.greptime.user }} + password: {{ .Values.greptime.password }} {{- end }} vector-hypervisor.yaml: | @@ -39,12 +89,15 @@ data: enabled: true sources: + kubernetes_logs: + type: kubernetes_logs + self_node_name: "${NODE_NAME}" + extra_label_selector: "tensor-fusion.ai/component in (hypervisor,worker)" metrics: type: file data_dir: /logs include: - - /logs/metrics.log - + - /logs/metrics.log.* transforms: parse_influx: type: remap @@ -67,6 +120,20 @@ data: all_metrics: true metrics: [] + prepare_kubernetes_logs: + type: remap + inputs: + - kubernetes_logs + source: | + .message = .message + .container = .kubernetes.container_name + .pod = .kubernetes.pod_name + .namespace = .kubernetes.pod_namespace + .component = .kubernetes.pod_labels."tensor-fusion.ai/component" + del(.kubernetes) + del(.file) + del(.source_type) + sinks: sink_greptimedb_hypervisor_metrics: type: greptimedb_metrics @@ -80,4 +147,18 @@ data: password: {{ .Values.greptime.password }} tls: {} {{- end }} - 
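// Both vector configs above now tail the metrics files under /logs with a `file` source and parse
// each record via parse_influxdb!, so the processes writing those files (operator and hypervisor)
// are expected to emit InfluxDB line protocol. A minimal sketch of such a writer, assuming the two
// dependencies this change adds to go.mod (line-protocol/v2 for encoding, lumberjack for rotation);
// the measurement, tag and field names are illustrative only.
package main

import (
	"time"

	"github.com/influxdata/line-protocol/v2/lineprotocol"
	"gopkg.in/natefinch/lumberjack.v2"
)

func main() {
	// Rotating writer backing the file picked up by the vector "metrics" source.
	out := &lumberjack.Logger{Filename: "/logs/metrics.log", MaxSize: 100, MaxBackups: 3}
	defer out.Close()

	var enc lineprotocol.Encoder
	enc.SetPrecision(lineprotocol.Millisecond)
	enc.StartLine("tf_gpu_usage") // illustrative measurement name
	enc.AddTag("nodeName", "node-a")
	enc.AddField("tflops", lineprotocol.MustNewValue(12.5))
	enc.EndLine(time.Now())
	if err := enc.Err(); err != nil {
		panic(err)
	}
	_, _ = out.Write(append(enc.Bytes(), '\n')) // keep records newline-terminated for the tailer
}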
+ sink_greptimedb_hypervisor_worker_logs: + type: greptimedb_logs + compression: gzip + table: tf_system_log + inputs: + - prepare_kubernetes_logs + {{- if ne .Values.greptime.isCloud true }} + endpoint: http://{{ .Values.greptime.host }}:4000 + dbname: public + {{- else }} + endpoint: https://{{ .Values.greptime.host }} + dbname: {{ .Values.greptime.db }} + username: {{ .Values.greptime.user }} + password: {{ .Values.greptime.password }} + {{- end }} diff --git a/charts/tensor-fusion/values.schema.json b/charts/tensor-fusion/values.schema.json index a3c112ed..6ff52262 100644 --- a/charts/tensor-fusion/values.schema.json +++ b/charts/tensor-fusion/values.schema.json @@ -288,7 +288,7 @@ "cloudEndpoint": { "type": "string", "description": "WebSocket endpoint for cloud communication", - "default": "wss://app.tensor-fusion.ai/_ws" + "default": "wss://app.tensor-fusion.ai" }, "image": { "type": "object", diff --git a/charts/tensor-fusion/values.yaml b/charts/tensor-fusion/values.yaml index 482e6f4f..4798299e 100644 --- a/charts/tensor-fusion/values.yaml +++ b/charts/tensor-fusion/values.yaml @@ -37,7 +37,18 @@ controller: podAnnotations: {} tolerations: [] - affinity: {} + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: tensor-fusion.ai/component + operator: In + values: + - operator + topologyKey: "kubernetes.io/hostname" livenessProbe: httpGet: path: /healthz @@ -94,7 +105,7 @@ greptime: agent: enrollToken: "token-from-cloud" agentId: 'org-from-cloud:env' - cloudEndpoint: "wss://app.tensor-fusion.ai/_ws" + cloudEndpoint: "wss://app.tensor-fusion.ai" image: repository: tensorfusion/tensor-fusion-agent diff --git a/cmd/main.go b/cmd/main.go index a12ae4f8..22ad1631 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -45,6 +45,8 @@ import ( "github.com/NexusGPU/tensor-fusion/internal/config" "github.com/NexusGPU/tensor-fusion/internal/controller" "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" + "github.com/NexusGPU/tensor-fusion/internal/metrics" + "github.com/NexusGPU/tensor-fusion/internal/portallocator" "github.com/NexusGPU/tensor-fusion/internal/server" "github.com/NexusGPU/tensor-fusion/internal/server/router" "github.com/NexusGPU/tensor-fusion/internal/version" @@ -57,6 +59,8 @@ var ( setupLog = ctrl.Log.WithName("setup") ) +const LeaderElectionID = "85104305.tensor-fusion.ai" + func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) @@ -64,6 +68,7 @@ func init() { // +kubebuilder:scaffold:scheme } +//nolint:gocyclo func main() { var metricsAddr string var enableLeaderElection bool @@ -72,6 +77,9 @@ func main() { var enableHTTP2 bool var tlsOpts []func(*tls.Config) var gpuInfoConfig string + var metricsPath string + var nodeLevelPortRange string + var clusterLevelPortRange string flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. 
"+ "Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.") @@ -85,6 +93,12 @@ func main() { "If set, HTTP/2 will be enabled for the metrics and webhook servers") flag.StringVar(&gpuInfoConfig, "gpu-info-config", "/etc/tensor-fusion/gpu-info.yaml", "specify the path to gpuInfoConfig file") + flag.StringVar(&metricsPath, "metrics-path", "/logs/metrics.log", "specify the path to metrics file") + flag.StringVar(&nodeLevelPortRange, "host-port-range", "40000-42000", + "specify the port range for assigning ports to pre-scheduled Pods such as vGPU workers") + flag.StringVar(&clusterLevelPortRange, "cluster-host-port-range", "42000-62000", + "specify the port range for assigning ports to random Pods"+ + " marked with `tensor-fusion.ai/host-port: auto` and `tensor-fusion.ai/port-name: ssh`") opts := zap.Options{ Development: true, } @@ -120,9 +134,13 @@ func main() { ctrl.Log.Error(err, "unable to read gpuInfoConfig file") gpuInfos = make([]config.GpuInfo, 0) } + gpuPricingMap := make(map[string]float64) + for _, gpuInfo := range gpuInfos { + gpuPricingMap[gpuInfo.FullModelName] = gpuInfo.CostPerHour + } // Watch configMap change with interval, check lastModifiedTime to reload gpuInfoConfig - watchGPUInfoChanges(gpuInfoConfig, &gpuInfos) + watchGPUInfoChanges(gpuInfoConfig, &gpuInfos, gpuPricingMap) // Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server. // More info: @@ -154,7 +172,7 @@ func main() { WebhookServer: webhookServer, HealthProbeBindAddress: probeAddr, LeaderElection: enableLeaderElection, - LeaderElectionID: "85104305.tensor-fusion.ai", + LeaderElectionID: LeaderElectionID, // LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily // when the Manager ends. This requires the binary to immediately end when the // Manager is stopped, otherwise, this setting is unsafe. 
Setting this significantly @@ -174,12 +192,32 @@ func main() { ctx := context.Background() + metricsRecorder := metrics.MetricsRecorder{ + MetricsOutputPath: metricsPath, + HourlyUnitPriceMap: gpuPricingMap, + + // Worker level map will be updated by cluster reconcile + // Key is poolName, second level key is QoS level + WorkerUnitPriceMap: make(map[string]map[string]metrics.RawBillingPricing), + } + + startMetricsRecorder(enableLeaderElection, mgr, metricsRecorder) + // Initialize GPU allocator and set up watches allocator := gpuallocator.NewGpuAllocator(ctx, mgr.GetClient(), 10*time.Second) if _, err = allocator.SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "unable to set up GPU allocator watches") os.Exit(1) } + + // Initialize Port allocator and set up watches + portAllocator, err := portallocator.NewPortAllocator(ctx, mgr.GetClient(), nodeLevelPortRange, clusterLevelPortRange) + if err != nil { + setupLog.Error(err, "unable to set up port allocator") + os.Exit(1) + } + _ = portAllocator.SetupWithManager(ctx, mgr) + if err = (&controller.TensorFusionConnectionReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), @@ -199,16 +237,17 @@ func main() { // nolint:goconst if os.Getenv("ENABLE_WEBHOOKS") != "false" { - if err = webhookcorev1.SetupPodWebhookWithManager(mgr); err != nil { + if err = webhookcorev1.SetupPodWebhookWithManager(mgr, portAllocator); err != nil { setupLog.Error(err, "unable to create webhook", "webhook", "Pod") os.Exit(1) } } if err = (&controller.TensorFusionClusterReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Recorder: mgr.GetEventRecorderFor("TensorFusionCluster"), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("TensorFusionCluster"), + MetricsRecorder: &metricsRecorder, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "TensorFusionCluster") os.Exit(1) @@ -255,8 +294,9 @@ func main() { os.Exit(1) } if err = (&controller.PodReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + PortAllocator: portAllocator, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Pod") os.Exit(1) @@ -278,11 +318,12 @@ func main() { os.Exit(1) } if err = (&controller.TensorFusionWorkloadReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Allocator: allocator, - Recorder: mgr.GetEventRecorderFor("tensorfusionworkload"), - GpuInfos: &gpuInfos, + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Allocator: allocator, + Recorder: mgr.GetEventRecorderFor("tensorfusionworkload"), + GpuInfos: &gpuInfos, + PortAllocator: portAllocator, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "TensorFusionWorkload") os.Exit(1) @@ -308,7 +349,12 @@ func main() { setupLog.Error(err, "failed to create connection router") os.Exit(1) } - httpServer := server.NewHTTPServer(connectionRouter) + assignHostPortRouter, err := router.NewAssignHostPortRouter(ctx, portAllocator) + if err != nil { + setupLog.Error(err, "failed to create assign host port router") + os.Exit(1) + } + httpServer := server.NewHTTPServer(connectionRouter, assignHostPortRouter) go func() { err := httpServer.Run() if err != nil { @@ -339,7 +385,18 @@ func main() { } } -func watchGPUInfoChanges(gpuInfoConfig string, gpuInfos *[]config.GpuInfo) { +func startMetricsRecorder(enableLeaderElection bool, mgr 
manager.Manager, metricsRecorder metrics.MetricsRecorder) { + if enableLeaderElection { + go func() { + <-mgr.Elected() + metricsRecorder.Start() + }() + } else { + go metricsRecorder.Start() + } +} + +func watchGPUInfoChanges(gpuInfoConfig string, gpuInfos *[]config.GpuInfo, gpuPricingMap map[string]float64) { var lastModTime time.Time if fileInfo, err := os.Stat(gpuInfoConfig); err == nil { lastModTime = fileInfo.ModTime() @@ -367,6 +424,9 @@ func watchGPUInfoChanges(gpuInfoConfig string, gpuInfos *[]config.GpuInfo) { } *gpuInfos = updatedGpuInfos + for _, gpuInfo := range updatedGpuInfos { + gpuPricingMap[gpuInfo.FullModelName] = gpuInfo.CostPerHour + } lastModTime = currentModTime ctrl.Log.Info("gpuInfo reloaded successfully.", "gpuInfoConfig", gpuInfoConfig) } diff --git a/cmd/nodediscovery/main.go b/cmd/nodediscovery/main.go index a3accb91..3641f01f 100644 --- a/cmd/nodediscovery/main.go +++ b/cmd/nodediscovery/main.go @@ -22,6 +22,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" "k8s.io/client-go/util/retry" @@ -200,7 +201,12 @@ func createOrUpdateTensorFusionGPU( }, } - err := retry.OnError(retry.DefaultBackoff, func(err error) bool { + err := retry.OnError(wait.Backoff{ + Steps: 10, + Duration: time.Second, + Factor: 1.0, + Jitter: 0.1, + }, func(err error) bool { return true // Retry on all errors for now }, func() error { _, err := controllerutil.CreateOrUpdate(ctx, k8sClient, gpu, func() error { @@ -253,6 +259,7 @@ func createOrUpdateTensorFusionGPU( NodeSelector: map[string]string{ "kubernetes.io/hostname": k8sNodeName, }, + RunningApps: []*tfv1.RunningAppDetail{}, } if gpu.Status.Available == nil { diff --git a/config/crd/bases/tensor-fusion.ai_gpunodes.yaml b/config/crd/bases/tensor-fusion.ai_gpunodes.yaml index 6cd2c886..c798cb3c 100644 --- a/config/crd/bases/tensor-fusion.ai_gpunodes.yaml +++ b/config/crd/bases/tensor-fusion.ai_gpunodes.yaml @@ -86,66 +86,19 @@ spec: status: description: GPUNodeStatus defines the observed state of GPUNode. 
properties: - allocationDetails: - description: Allocation details is for node compaction, and calculate - used apps + allocationInfo: items: properties: - limits: - properties: - tflops: - anyOf: - - type: integer - - type: string - description: Tera floating point operations per second - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - vram: - anyOf: - - type: integer - - type: string - description: VRAM is short for Video memory, namely GPU - RAM - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: object - namespace: - type: string - podID: - type: string - podName: - type: string - qos: - enum: - - low - - medium - - high - - critical + count: + description: Worker count + type: integer + name: + description: Workload name namespace type: string - requests: - properties: - tflops: - anyOf: - - type: integer - - type: string - description: Tera floating point operations per second - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - vram: - anyOf: - - type: integer - - type: string - description: VRAM is short for Video memory, namely GPU - RAM - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: object - workload: + namespace: type: string required: - - limits - - namespace - - requests + - count type: object type: array availableTFlops: diff --git a/config/crd/bases/tensor-fusion.ai_gpupools.yaml b/config/crd/bases/tensor-fusion.ai_gpupools.yaml index c2257300..781418d0 100644 --- a/config/crd/bases/tensor-fusion.ai_gpupools.yaml +++ b/config/crd/bases/tensor-fusion.ai_gpupools.yaml @@ -546,7 +546,8 @@ spec: description: Default requests and limitsOverRequests are same, indicates normal on-demand serverless GPU usage, in hands-on lab low QoS case, limitsOverRequests should - be cheaper, for example Low QoS, ratio should be 0.5 + be lower, so that user can get burstable GPU resources + with very low cost type: string qos: enum: @@ -704,6 +705,9 @@ spec: readyNodes: format: int32 type: integer + runningAppsCnt: + format: int32 + type: integer savedCostsPerMonth: type: string totalGPUs: diff --git a/config/crd/bases/tensor-fusion.ai_gpus.yaml b/config/crd/bases/tensor-fusion.ai_gpus.yaml index ace87bc4..09a14f86 100644 --- a/config/crd/bases/tensor-fusion.ai_gpus.yaml +++ b/config/crd/bases/tensor-fusion.ai_gpus.yaml @@ -116,6 +116,21 @@ spec: - Destroying - Migrating type: string + runningApps: + items: + properties: + count: + description: Worker count + type: integer + name: + description: Workload name namespace + type: string + namespace: + type: string + required: + - count + type: object + type: array uuid: type: string required: diff --git a/config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml b/config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml index b7bc95c5..3cb00209 100644 --- a/config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml +++ b/config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml @@ -650,8 +650,8 @@ spec: description: Default requests and limitsOverRequests are same, indicates normal on-demand serverless GPU usage, in hands-on lab low QoS case, limitsOverRequests - should be cheaper, for example 
Low QoS, ratio - should be 0.5 + should be lower, so that user can get burstable + GPU resources with very low cost type: string qos: enum: diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 82095a0f..ea3cf175 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -116,6 +116,18 @@ rules: - patch - update - watch +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - tensor-fusion.ai resources: diff --git a/config/webhook/manifests.yaml b/config/webhook/manifests.yaml index 5db5cdf3..1975a164 100644 --- a/config/webhook/manifests.yaml +++ b/config/webhook/manifests.yaml @@ -23,6 +23,7 @@ webhooks: resources: - pods sideEffects: None + timeoutSeconds: 30 objectSelector: matchExpressions: - key: tensor-fusion.ai/enabled diff --git a/go.mod b/go.mod index 175bdb9e..4e63759a 100644 --- a/go.mod +++ b/go.mod @@ -11,20 +11,20 @@ require ( github.com/aws/smithy-go v1.22.3 github.com/gin-contrib/gzip v1.2.3 github.com/gin-gonic/gin v1.10.1 + github.com/influxdata/line-protocol/v2 v2.2.1 github.com/lithammer/shortuuid/v4 v4.2.0 github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.37.0 - github.com/prometheus/client_golang v1.22.0 github.com/samber/lo v1.50.0 github.com/shirou/gopsutil v3.21.11+incompatible github.com/stretchr/testify v1.10.0 - golang.org/x/exp v0.0.0-20241217172543-b2144cdd0a67 gomodules.xyz/jsonpatch/v2 v2.5.0 + gopkg.in/natefinch/lumberjack.v2 v2.2.1 k8s.io/api v0.33.1 k8s.io/apimachinery v0.33.1 k8s.io/client-go v0.33.1 k8s.io/component-helpers v0.33.1 - k8s.io/utils v0.0.0-20241210054802-24370beab758 + k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 sigs.k8s.io/controller-runtime v0.21.0 sigs.k8s.io/yaml v1.4.0 ) @@ -86,6 +86,7 @@ require ( github.com/pelletier/go-toml/v2 v2.2.3 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/prometheus/client_golang v1.22.0 // indirect github.com/prometheus/client_model v0.6.1 // indirect github.com/prometheus/common v0.62.0 // indirect github.com/prometheus/procfs v0.15.1 // indirect @@ -109,15 +110,16 @@ require ( go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 // indirect golang.org/x/arch v0.15.0 // indirect - golang.org/x/crypto v0.36.0 // indirect - golang.org/x/net v0.38.0 // indirect + golang.org/x/crypto v0.38.0 // indirect + golang.org/x/exp v0.0.0-20250506013437-ce4c2cf36ca6 // indirect + golang.org/x/net v0.40.0 // indirect golang.org/x/oauth2 v0.27.0 // indirect - golang.org/x/sync v0.12.0 // indirect - golang.org/x/sys v0.32.0 // indirect - golang.org/x/term v0.30.0 // indirect - golang.org/x/text v0.23.0 // indirect + golang.org/x/sync v0.14.0 // indirect + golang.org/x/sys v0.33.0 // indirect + golang.org/x/term v0.32.0 // indirect + golang.org/x/text v0.25.0 // indirect golang.org/x/time v0.9.0 // indirect - golang.org/x/tools v0.31.0 // indirect + golang.org/x/tools v0.33.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20241223144023-3abc09e42ca8 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20241223144023-3abc09e42ca8 // indirect google.golang.org/grpc v1.69.2 // indirect diff --git a/go.sum b/go.sum index d556026c..1248b05f 100644 --- a/go.sum +++ b/go.sum @@ -57,6 +57,10 @@ github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/X github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= 
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= +github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= +github.com/frankban/quicktest v1.11.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= +github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk= +github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU= github.com/fsnotify/fsnotify v1.8.0 h1:dAwr6QBTBZIkG8roQaJjGof0pp0EeF+tNV7YBP3F/8M= github.com/fsnotify/fsnotify v1.8.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= @@ -109,7 +113,9 @@ github.com/google/cel-go v0.23.2 h1:UdEe3CvQh3Nv+E/j9r1Y//WO0K0cSyD7/y0bzyLIMI4= github.com/google/cel-go v0.23.2/go.mod h1:52Pb6QsDbC5kvgxvZhiL9QX1oZEkcUF/ZqaPx1J5Wwo= github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= +github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= @@ -126,6 +132,13 @@ github.com/grpc-ecosystem/grpc-gateway/v2 v2.25.1 h1:VNqngBF40hVlDloBruUehVYC3Ar github.com/grpc-ecosystem/grpc-gateway/v2 v2.25.1/go.mod h1:RBRO7fro65R6tjKzYgLAFo0t1QEXY1Dp+i/bvpRiqiQ= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/influxdata/line-protocol-corpus v0.0.0-20210519164801-ca6fa5da0184/go.mod h1:03nmhxzZ7Xk2pdG+lmMd7mHDfeVOYFyhOgwO61qWU98= +github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937 h1:MHJNQ+p99hFATQm6ORoLmpUCF7ovjwEFshs/NHzAbig= +github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937/go.mod h1:BKR9c0uHSmRgM/se9JhFHtTT7JTO67X23MtKMHtZcpo= +github.com/influxdata/line-protocol/v2 v2.0.0-20210312151457-c52fdecb625a/go.mod h1:6+9Xt5Sq1rWx+glMgxhcg2c0DUaehK+5TDcPZ76GypY= +github.com/influxdata/line-protocol/v2 v2.1.0/go.mod h1:QKw43hdUBg3GTk2iC3iyCxksNj7PX9aUSeYOYE/ceHY= +github.com/influxdata/line-protocol/v2 v2.2.1 h1:EAPkqJ9Km4uAxtMRgUubJyqAr6zgWM0dznKMLRauQRE= +github.com/influxdata/line-protocol/v2 v2.2.1/go.mod h1:DmB3Cnh+3oxmG6LOBIxce4oaL4CPj3OmMPgvauXh+tM= github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= @@ -143,6 +156,7 @@ github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa02 github.com/klauspost/cpuid/v2 v2.2.10 h1:tBs3QSyvjDyFTq3uoc/9xFpCuOsJQFNPiAhYdw2skhE= github.com/klauspost/cpuid/v2 v2.2.10/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= github.com/knz/go-libedit v1.10.1/go.mod 
h1:MZTVkCWyz0oBc7JOWP3wNAzd002ZbM/5hgShxwh4x8M= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= @@ -266,15 +280,15 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34= -golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc= +golang.org/x/crypto v0.38.0 h1:jt+WWG8IZlBnVbomuhg2Mdq0+BBQaHbtqHEFEigjUV8= +golang.org/x/crypto v0.38.0/go.mod h1:MvrbAqul58NNYPKnOra203SB9vpuZW0e+RRZV+Ggqjw= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY= -golang.org/x/exp v0.0.0-20241217172543-b2144cdd0a67 h1:1UoZQm6f0P/ZO0w1Ri+f+ifG/gXhegadRdwBIXEFWDo= -golang.org/x/exp v0.0.0-20241217172543-b2144cdd0a67/go.mod h1:qj5a5QZpwLU2NLQudwIN5koi3beDhSAlJwa67PuM98c= +golang.org/x/exp v0.0.0-20250506013437-ce4c2cf36ca6 h1:y5zboxd6LQAqYIhHnB48p0ByQ/GnQx2BE33L8BOHQkI= +golang.org/x/exp v0.0.0-20250506013437-ce4c2cf36ca6/go.mod h1:U6Lno4MTRCDY+Ba7aCcauB9T60gsv5s4ralQzP72ZoQ= golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= @@ -286,29 +300,29 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= -golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= +golang.org/x/net v0.40.0 h1:79Xs7wF06Gbdcg4kdCCIQArK11Z1hr5POQ6+fIYHNuY= +golang.org/x/net v0.40.0/go.mod h1:y0hY0exeL2Pku80/zKK7tpntoX23cqL3Oa6njdgRtds= golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M= golang.org/x/oauth2 v0.27.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync 
v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw= -golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.14.0 h1:woo0S4Yywslg6hp4eUFjTVOyKt0RookbpAHG4c1HmhQ= +golang.org/x/sync v0.14.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= -golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y= -golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g= +golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= +golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/term v0.32.0 h1:DR4lr0TjUs3epypdhTOkMmuF5CDFJ/8pOnbzMZPQ7bg= +golang.org/x/term v0.32.0/go.mod h1:uZG1FhGx848Sqfsq4/DlJr3xGGsYMu/L5GW4abiaEPQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= -golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= +golang.org/x/text v0.25.0 h1:qVyWApTSYLk/drJRO5mDlNYskwQznZmkpV2c8q9zls4= +golang.org/x/text v0.25.0/go.mod h1:WEdwpYrmk1qmdHvhkSTNPm3app7v4rsT8F2UD6+VHIA= golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -318,8 +332,8 @@ golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtn golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.31.0 h1:0EedkvKDbh+qistFTd0Bcwe/YLh4vHwWEkiI0toFIBU= -golang.org/x/tools v0.31.0/go.mod h1:naFTU+Cev749tSJRXJlna0T3WxKvb1kWEx15xA4SdmQ= +golang.org/x/tools v0.33.0 h1:4qz2S3zmRxbGIhDIAgjxvFutSvH5EfnsYrRBj0UI0bc= +golang.org/x/tools v0.33.0/go.mod h1:CIJMaWEY88juyUfo7UbgPqbC8rU2OqfAV1h2Qp0oMYI= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -348,10 +362,13 @@ gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= 
gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= +gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc= +gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.0/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= @@ -373,8 +390,8 @@ k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff h1:/usPimJzUKKu+m+TE36gUyGcf03XZEP0ZIKgKj35LS4= k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8= -k8s.io/utils v0.0.0-20241210054802-24370beab758 h1:sdbE21q2nlQtFh65saZY+rRM6x6aJJI8IUa1AmH/qa0= -k8s.io/utils v0.0.0-20241210054802-24370beab758/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 h1:jgJW5IePPXLGB8e/1wvd0Ich9QE97RvvF3a8J3fP/Lg= +k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= nullprogram.com/x/optparse v1.0.0/go.mod h1:KdyPE+Igbe0jQUrVfMqDMeJQIJZEuyV7pjYmp6pbG50= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM= diff --git a/internal/alert/evaluation.go b/internal/alert/evaluation.go new file mode 100644 index 00000000..6b05069f --- /dev/null +++ b/internal/alert/evaluation.go @@ -0,0 +1,4 @@ +package alert + +// connect TSDB, eval every minute of all rules to generate alerts in mem, and check existing alert resolved or not +// send alerts API to alertmanager, let alertmanager to do deduplication, notification stuff diff --git a/internal/alert/rules.go b/internal/alert/rules.go new file mode 100644 index 00000000..b5d89882 --- /dev/null +++ b/internal/alert/rules.go @@ -0,0 +1,4 @@ +package alert + +// offer API for managing user configured alert rules, stored in configMap +// offer mem synced rules for evaluation routine to use diff --git a/internal/alert/setup.go b/internal/alert/setup.go new file mode 100644 index 00000000..6f33be57 --- /dev/null +++ b/internal/alert/setup.go @@ -0,0 +1,9 @@ +package alert + +// offer API to install/update prometheus alertmanager with configMap and values from configuration with a single statefulSet +// let user to manage and upgrade alertmanager by themselves +// wrap notification configurations and change config map then trigger reload like prometheus operator does (if not install by tensor-fusion, let user to use AlertManagerConfig by themselves, tensor-fusion will only trigger alert to pre-configured alertmanager endpoint) + +// use config map to 
manager alertmanager config + +// TODO: diff --git a/internal/cloudprovider/common/utils.go b/internal/cloudprovider/common/utils.go index ec1345f7..6b29a058 100644 --- a/internal/cloudprovider/common/utils.go +++ b/internal/cloudprovider/common/utils.go @@ -8,9 +8,10 @@ import ( "strings" "time" + "math/rand" + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/cloudprovider/types" - "golang.org/x/exp/rand" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "sigs.k8s.io/controller-runtime/pkg/log" @@ -205,7 +206,8 @@ func contains(slice []string, item string) bool { func generateRandomString(length int) string { const charset = "abcdefghijklmnopqrstuvwxyz" - rand.Seed(uint64(time.Now().UnixNano())) + source := rand.NewSource(time.Now().UnixNano()) + rand := rand.New(source) result := make([]byte, length) for i := range result { diff --git a/internal/config/gpupool_mock.go b/internal/config/gpupool_mock.go index 63ca4ff9..40b2f3e9 100644 --- a/internal/config/gpupool_mock.go +++ b/internal/config/gpupool_mock.go @@ -4,6 +4,7 @@ import ( "encoding/json" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/samber/lo" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" @@ -127,4 +128,33 @@ var MockGPUPoolSpec = &tfv1.GPUPoolSpec{ }, }, }, + QosConfig: &tfv1.QosConfig{ + Definitions: []tfv1.QosDefinition{ + { + Name: constants.QoSLevelMedium, + }, + { + Name: constants.QoSLevelHigh, + }, + }, + DefaultQoS: constants.QoSLevelMedium, + Pricing: []tfv1.QosPricing{ + { + Qos: constants.QoSLevelMedium, + Requests: tfv1.GPUResourcePricingUnit{ + PerFP16TFlopsPerHour: "2", + PerGBOfVRAMPerHour: "1", + }, + LimitsOverRequestsChargingRatio: "0.5", + }, + { + Qos: constants.QoSLevelHigh, + Requests: tfv1.GPUResourcePricingUnit{ + PerFP16TFlopsPerHour: "2", + PerGBOfVRAMPerHour: "1", + }, + LimitsOverRequestsChargingRatio: "0.8", + }, + }, + }, } diff --git a/internal/constants/constants.go b/internal/constants/constants.go index ed60ae7f..94b3f83e 100644 --- a/internal/constants/constants.go +++ b/internal/constants/constants.go @@ -18,8 +18,15 @@ const ( LabelKeyClusterOwner = Domain + "/cluster" LabelKeyNodeClass = Domain + "/node-class" LabelKeyPodTemplateHash = Domain + "/pod-template-hash" + LabelComponent = Domain + "/component" TrueStringValue = "true" + ComponentClient = "client" + ComponentWorker = "worker" + ComponentHypervisor = "hypervisor" + ComponentNodeDiscovery = "node-discovery" + ComponentOperator = "operator" + GPUNodePoolIdentifierLabelPrefix = Domain + "/pool-" GPUNodePoolIdentifierLabelFormat = Domain + "/pool-%s" NodeDeletionMark = Domain + "/should-delete" @@ -45,6 +52,11 @@ const ( IsLocalGPUAnnotation = Domain + "/is-local-gpu" NoStandaloneWorkerModeAnnotation = Domain + "/no-standalone-worker-mode" + GenHostPortLabel = Domain + "/host-port" + GenHostPortLabelValue = "auto" + GenHostPortNameLabel = Domain + "/port-name" + GenPortNumberAnnotation = Domain + "/port-number" + AutoScaleLimitsAnnotation = Domain + "/auto-limits" AutoScaleRequestsAnnotation = Domain + "/auto-requests" AutoScaleReplicasAnnotation = Domain + "/auto-replicas" @@ -72,6 +84,15 @@ const ( WorkerPodNameEnv = "POD_NAME" NamespaceEnv = "OPERATOR_NAMESPACE" NamespaceDefaultVal = "tensor-fusion-sys" + + KubernetesHostNameLabel = "kubernetes.io/hostname" + GiBToBytes = 1024 * 1024 * 1024 + HypervisorServiceAccountName = "tensor-fusion-hypervisor-sa" + + QoSLevelLow = "low" + 
QoSLevelMedium = "medium" + QoSLevelHigh = "high" + QoSLevelCritical = "critical" ) const ( @@ -123,3 +144,8 @@ const ( const TFDataPath = "/tmp/tensor-fusion/data" const DataVolumeName = "tf-data" const TensorFusionPoolManualCompaction = Domain + "/manual-compaction" + +const ( + LeaderInfoConfigMapName = "tensor-fusion-operator-leader-info" + LeaderInfoConfigMapLeaderIPKey = "leader-ip" +) diff --git a/internal/controller/gpu_controller.go b/internal/controller/gpu_controller.go index 1c11df63..79961272 100644 --- a/internal/controller/gpu_controller.go +++ b/internal/controller/gpu_controller.go @@ -88,6 +88,11 @@ func (r *GPUReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R return ctrl.Result{}, fmt.Errorf("node %s is not assigned to any pool", gpunode.Name) } + // No need to calculate patch since GPU's owner pool not changed + if gpu.Labels != nil && gpu.Labels[constants.GpuPoolKey] == poolName { + return ctrl.Result{}, nil + } + patch := client.MergeFrom(gpu.DeepCopy()) if gpu.Labels == nil { gpu.Labels = make(map[string]string) diff --git a/internal/controller/gpunode_controller.go b/internal/controller/gpunode_controller.go index ea28e9e6..b858b95c 100644 --- a/internal/controller/gpunode_controller.go +++ b/internal/controller/gpunode_controller.go @@ -26,13 +26,12 @@ import ( cloudprovider "github.com/NexusGPU/tensor-fusion/internal/cloudprovider" "github.com/NexusGPU/tensor-fusion/internal/cloudprovider/types" "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" "github.com/NexusGPU/tensor-fusion/internal/metrics" "github.com/NexusGPU/tensor-fusion/internal/utils" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" @@ -54,6 +53,7 @@ type GPUNodeReconciler struct { // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes/status,verbs=get;update;patch // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes/finalizers,verbs=update +// +kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;list;watch;create;update;patch;delete // Reconcile GPU nodes func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { @@ -80,6 +80,9 @@ func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct } } + // remove from metrics map + metrics.RemoveNodeMetrics(node.Name) + switch node.Spec.ManageMode { case tfv1.GPUNodeManageModeAutoSelect: // Do nothing, but if it's managed by Karpenter, should come up with some way to tell Karpenter to terminate the GPU node @@ -215,45 +218,13 @@ func (r *GPUNodeReconciler) checkStatusAndUpdateVirtualCapacity(ctx context.Cont return true, nil } else { - gpuList, err := r.fetchAllOwnedGPUDevices(ctx, node) + gpuModels, err := gpuallocator.RefreshGPUNodeCapacity(ctx, r.Client, node, poolObj) if err != nil { return true, err } - if len(gpuList) == 0 { - // node discovery job not completed, check again - return true, nil - } - - statusCopy := node.Status.DeepCopy() - - node.Status.AvailableVRAM = resource.Quantity{} - node.Status.AvailableTFlops = resource.Quantity{} - node.Status.TotalTFlops = resource.Quantity{} - node.Status.TotalVRAM = 
resource.Quantity{} - - for _, gpu := range gpuList { - node.Status.AvailableVRAM.Add(gpu.Status.Available.Vram) - node.Status.AvailableTFlops.Add(gpu.Status.Available.Tflops) - node.Status.TotalVRAM.Add(gpu.Status.Capacity.Vram) - node.Status.TotalTFlops.Add(gpu.Status.Capacity.Tflops) - } // update metrics to get historical allocation line chart and trending - metrics.AllocatedTflopsPercent.WithLabelValues(node.Status.KubernetesNodeName, poolObj.Name).Set((node.Status.TotalTFlops.AsApproximateFloat64() - node.Status.AvailableTFlops.AsApproximateFloat64()) / node.Status.TotalTFlops.AsApproximateFloat64()) - metrics.AllocatedVramBytes.WithLabelValues(node.Status.KubernetesNodeName, poolObj.Name).Set(node.Status.TotalVRAM.AsApproximateFloat64() - node.Status.AvailableVRAM.AsApproximateFloat64()) - - virtualVRAM, virtualTFlops := r.CalculateVirtualCapacity(node, poolObj) - node.Status.VirtualTFlops = virtualTFlops - node.Status.VirtualVRAM = virtualVRAM - - node.Status.Phase = tfv1.TensorFusionGPUNodePhaseRunning - - if !equality.Semantic.DeepEqual(node.Status, statusCopy) { - err = r.Status().Update(ctx, node) - if err != nil { - return true, fmt.Errorf("failed to update GPU node status: %w", err) - } - } + metrics.SetNodeMetrics(node, poolObj, gpuModels) err = r.syncStatusToGPUDevices(ctx, node, tfv1.TensorFusionGPUPhaseRunning) if err != nil { @@ -305,21 +276,24 @@ func (r *GPUNodeReconciler) reconcileNodeDiscoveryJob( if err != nil { return fmt.Errorf("unmarshal pod template: %w", err) } - - templateCopy := podTmpl.Template.DeepCopy() - if templateCopy.Spec.Affinity == nil { - templateCopy.Spec.Affinity = &corev1.Affinity{} + tmpl := podTmpl.Template + if tmpl.Labels == nil { + tmpl.Labels = map[string]string{} } - if templateCopy.Spec.Affinity.NodeAffinity == nil { - templateCopy.Spec.Affinity.NodeAffinity = &corev1.NodeAffinity{} + tmpl.Labels[constants.LabelComponent] = constants.ComponentNodeDiscovery + if tmpl.Spec.Affinity == nil { + tmpl.Spec.Affinity = &corev1.Affinity{} } - if templateCopy.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil { - templateCopy.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution = &corev1.NodeSelector{ + if tmpl.Spec.Affinity.NodeAffinity == nil { + tmpl.Spec.Affinity.NodeAffinity = &corev1.NodeAffinity{} + } + if tmpl.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil { + tmpl.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution = &corev1.NodeSelector{ NodeSelectorTerms: make([]corev1.NodeSelectorTerm, 0), } } - templateCopy.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms = - append(templateCopy.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms, corev1.NodeSelectorTerm{ + tmpl.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms = + append(tmpl.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms, corev1.NodeSelectorTerm{ MatchFields: []corev1.NodeSelectorRequirement{ { Key: "metadata.name", @@ -329,19 +303,19 @@ func (r *GPUNodeReconciler) reconcileNodeDiscoveryJob( }, }) // allow job to run at any taint Nodes that marked as NoSchedule - if templateCopy.Spec.Tolerations == nil { - templateCopy.Spec.Tolerations = []corev1.Toleration{} + if tmpl.Spec.Tolerations == nil { + tmpl.Spec.Tolerations = []corev1.Toleration{} } - templateCopy.Spec.Tolerations = 
append(templateCopy.Spec.Tolerations, corev1.Toleration{ + tmpl.Spec.Tolerations = append(tmpl.Spec.Tolerations, corev1.Toleration{ Key: "NoSchedule", Operator: corev1.TolerationOpExists, }) - if len(templateCopy.Spec.Containers) > 0 { - if len(templateCopy.Spec.Containers[0].Env) == 0 { - templateCopy.Spec.Containers[0].Env = []corev1.EnvVar{} + if len(tmpl.Spec.Containers) > 0 { + if len(tmpl.Spec.Containers[0].Env) == 0 { + tmpl.Spec.Containers[0].Env = []corev1.EnvVar{} } - templateCopy.Spec.Containers[0].Env = append(templateCopy.Spec.Containers[0].Env, corev1.EnvVar{ + tmpl.Spec.Containers[0].Env = append(tmpl.Spec.Containers[0].Env, corev1.EnvVar{ Name: constants.NodeDiscoveryReportGPUNodeEnvName, Value: gpunode.Name, }) @@ -350,12 +324,14 @@ func (r *GPUNodeReconciler) reconcileNodeDiscoveryJob( // create node-discovery job job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ - Name: getDiscoveryJobName(gpunode.Name), - Namespace: utils.CurrentNamespace(), + Name: getDiscoveryJobName(gpunode.Name), + Namespace: utils.CurrentNamespace(), + Labels: tmpl.Labels, + Annotations: tmpl.Annotations, }, Spec: batchv1.JobSpec{ TTLSecondsAfterFinished: ptr.To[int32](3600 * 10), - Template: *templateCopy, + Template: tmpl, }, } if err := r.Get(ctx, client.ObjectKeyFromObject(job), job); err != nil { @@ -367,7 +343,7 @@ func (r *GPUNodeReconciler) reconcileNodeDiscoveryJob( return fmt.Errorf("create node discovery job %w", err) } } else { - return fmt.Errorf("create node job %w", err) + return fmt.Errorf("create node discovery job %w", err) } } @@ -432,7 +408,7 @@ func (r *GPUNodeReconciler) createHypervisorPod(ctx context.Context, key client. if err != nil { return fmt.Errorf("failed to unmarshal pod template: %w", err) } - spec := podTmpl.Template.Spec.DeepCopy() + spec := podTmpl.Template.Spec if spec.NodeSelector == nil { spec.NodeSelector = make(map[string]string) } @@ -450,16 +426,24 @@ func (r *GPUNodeReconciler) createHypervisorPod(ctx context.Context, key client. 
ReadOnly: false, MountPath: constants.TFDataPath, }) + spec.ServiceAccountName = constants.HypervisorServiceAccountName newPod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: key.Name, Namespace: key.Namespace, - Labels: map[string]string{ - fmt.Sprintf(constants.GPUNodePoolIdentifierLabelFormat, pool.Name): "true", - constants.LabelKeyPodTemplateHash: utils.GetObjectHash(pool.Spec.ComponentConfig.Hypervisor), - }, + Labels: func() map[string]string { + mergedLabels := make(map[string]string) + for k, v := range podTmpl.Template.Labels { + mergedLabels[k] = v + } + mergedLabels[fmt.Sprintf(constants.GPUNodePoolIdentifierLabelFormat, pool.Name)] = "true" + mergedLabels[constants.LabelKeyPodTemplateHash] = utils.GetObjectHash(pool.Spec.ComponentConfig.Hypervisor) + mergedLabels[constants.LabelComponent] = constants.ComponentHypervisor + return mergedLabels + }(), + Annotations: podTmpl.Template.Annotations, }, - Spec: *spec, + Spec: spec, } if newPod.Spec.Tolerations == nil { @@ -579,31 +563,12 @@ func (r *GPUNodeReconciler) reconcileCloudVendorNode(ctx context.Context, node * return nil } -func (r *GPUNodeReconciler) CalculateVirtualCapacity(node *tfv1.GPUNode, pool *tfv1.GPUPool) (resource.Quantity, resource.Quantity) { - diskSize, _ := node.Status.NodeInfo.DataDiskSize.AsInt64() - ramSize, _ := node.Status.NodeInfo.RAMSize.AsInt64() - - virtualVRAM := node.Status.TotalVRAM.DeepCopy() - // TODO: panic if not set TFlopsOversellRatio - vTFlops := node.Status.TotalTFlops.AsApproximateFloat64() * (float64(pool.Spec.CapacityConfig.Oversubscription.TFlopsOversellRatio) / 100.0) - - virtualVRAM.Add(*resource.NewQuantity( - int64(float64(float64(diskSize)*float64(pool.Spec.CapacityConfig.Oversubscription.VRAMExpandToHostDisk)/100.0)), - resource.DecimalSI), - ) - virtualVRAM.Add(*resource.NewQuantity( - int64(float64(float64(ramSize)*float64(pool.Spec.CapacityConfig.Oversubscription.VRAMExpandToHostMem)/100.0)), - resource.DecimalSI), - ) - - return virtualVRAM, *resource.NewQuantity(int64(vTFlops), resource.DecimalSI) -} - // SetupWithManager sets up the controller with the Manager. func (r *GPUNodeReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). For(&tfv1.GPUNode{}). Named("gpunode"). + // TODO: should not own node, let node_claim_controller to own node for cloud vendor VM nodes, Owns(&corev1.Node{}). Owns(&batchv1.Job{}). Owns(&corev1.Pod{}). 
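The capacity math that the deleted CalculateVirtualCapacity helper performed, and that the new gpuallocator.RefreshGPUNodeCapacity call is presumably expected to keep producing, boils down to the sketch below. It is a minimal illustration reconstructed from the removed lines; virtualCapacitySketch is an illustrative name, not a function in the codebase, and the oversubscription fields are percentages (a TFlopsOversellRatio of 100 leaves physical capacity unchanged).

package sketch

import (
	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// virtualCapacitySketch mirrors the arithmetic of the removed CalculateVirtualCapacity:
// TFlops are oversold by a percentage ratio, and VRAM is expanded onto host disk and
// host RAM by the configured percentages.
func virtualCapacitySketch(node *tfv1.GPUNode, pool *tfv1.GPUPool) (resource.Quantity, resource.Quantity) {
	diskSize, _ := node.Status.NodeInfo.DataDiskSize.AsInt64()
	ramSize, _ := node.Status.NodeInfo.RAMSize.AsInt64()

	// Oversell physical TFlops by the configured percentage ratio.
	vTFlops := node.Status.TotalTFlops.AsApproximateFloat64() *
		float64(pool.Spec.CapacityConfig.Oversubscription.TFlopsOversellRatio) / 100.0

	// Spill a percentage of the host data disk and host memory into virtual VRAM.
	virtualVRAM := node.Status.TotalVRAM.DeepCopy()
	virtualVRAM.Add(*resource.NewQuantity(
		int64(float64(diskSize)*float64(pool.Spec.CapacityConfig.Oversubscription.VRAMExpandToHostDisk)/100.0),
		resource.DecimalSI))
	virtualVRAM.Add(*resource.NewQuantity(
		int64(float64(ramSize)*float64(pool.Spec.CapacityConfig.Oversubscription.VRAMExpandToHostMem)/100.0),
		resource.DecimalSI))

	return virtualVRAM, *resource.NewQuantity(int64(vTFlops), resource.DecimalSI)
}

Under this scheme, a pool configured with a TFlopsOversellRatio of 500 advertises five times the physical TFlops, which is the virtual figure the pool controller then aggregates into virtualAvailableTFlops.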
diff --git a/internal/controller/gpunode_controller_test.go b/internal/controller/gpunode_controller_test.go index 15b370d7..18ceec48 100644 --- a/internal/controller/gpunode_controller_test.go +++ b/internal/controller/gpunode_controller_test.go @@ -41,7 +41,7 @@ var _ = Describe("GPUNode Controller", func() { By("checking that the k8s node name should be set") Eventually(func(g Gomega) { g.Expect(gpuNode.Status.KubernetesNodeName).Should(Equal(gpuNode.Name)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) By("checking that the node discovery job is created") Eventually(func(g Gomega) { @@ -52,7 +52,7 @@ var _ = Describe("GPUNode Controller", func() { }, job)).Should(Succeed()) g.Expect(job.Spec.TTLSecondsAfterFinished).Should(Equal(ptr.To[int32](3600 * 10))) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) By("checking that the hypervisor pod is created") pod := &corev1.Pod{} @@ -63,13 +63,13 @@ var _ = Describe("GPUNode Controller", func() { }, pod) g.Expect(err).ShouldNot(HaveOccurred()) g.Expect(pod.Status.Phase).Should(Equal(corev1.PodRunning)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) By("checking that the gpunode status phase should be running") Eventually(func(g Gomega) { gpunode := tfEnv.GetGPUNode(0, 0) g.Expect(gpunode.Status.Phase).Should(Equal(tfv1.TensorFusionGPUNodePhaseRunning)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) By("checking the hypervisor pod should be recreated when enters terminated status") pod.Status.Phase = corev1.PodFailed @@ -82,7 +82,7 @@ var _ = Describe("GPUNode Controller", func() { }, newPod) g.Expect(err).ShouldNot(HaveOccurred()) g.Expect(newPod.UID).ShouldNot(Equal(pod.UID)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) tfEnv.Cleanup() diff --git a/internal/controller/gpupool_controller.go b/internal/controller/gpupool_controller.go index 257c94cf..dc9982d6 100644 --- a/internal/controller/gpupool_controller.go +++ b/internal/controller/gpupool_controller.go @@ -159,6 +159,9 @@ func (r *GPUPoolReconciler) reconcilePoolCurrentCapacityAndReadiness(ctx context virtualAvailableVRAM := resource.Quantity{} virtualAvailableTFlops := resource.Quantity{} + runningAppsCnt := int32(0) + deduplicationMap := make(map[string]struct{}) + for _, node := range nodes.Items { totalGPUs = totalGPUs + node.Status.TotalGPUs totalVRAM.Add(node.Status.TotalVRAM) @@ -178,6 +181,13 @@ func (r *GPUPoolReconciler) reconcilePoolCurrentCapacityAndReadiness(ctx context if node.Status.VirtualAvailableTFlops != nil { virtualAvailableTFlops.Add(*node.Status.VirtualAvailableTFlops) } + + for _, runningApp := range node.Status.AllocationInfo { + if _, ok := deduplicationMap[runningApp.Name+"_"+runningApp.Namespace]; !ok { + runningAppsCnt++ + deduplicationMap[runningApp.Name+"_"+runningApp.Namespace] = struct{}{} + } + } } pool.Status.TotalGPUs = totalGPUs @@ -196,6 +206,8 @@ func (r *GPUPoolReconciler) reconcilePoolCurrentCapacityAndReadiness(ctx context pool.Status.VirtualTFlops = virtualTFlops pool.Status.VirtualVRAM = virtualVRAM + pool.Status.RunningAppsCnt = runningAppsCnt + allowScaleToZero := true if pool.Spec.CapacityConfig != nil && pool.Spec.CapacityConfig.MinResources != nil { minTFlops, _ := pool.Spec.CapacityConfig.MinResources.TFlops.AsInt64() diff --git a/internal/controller/gpupool_controller_test.go b/internal/controller/gpupool_controller_test.go index bfbfd728..49e276d2 100644 --- a/internal/controller/gpupool_controller_test.go +++ 
b/internal/controller/gpupool_controller_test.go @@ -19,6 +19,7 @@ package controller import ( "encoding/json" "fmt" + "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/constants" @@ -42,7 +43,7 @@ var _ = Describe("GPUPool Controller", func() { Eventually(func(g Gomega) { pool := tfEnv.GetGPUPool(0) g.Expect(pool.Status.Phase).Should(Equal(tfv1.TensorFusionPoolPhaseRunning)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) tfEnv.Cleanup() }) }) @@ -58,7 +59,7 @@ var _ = Describe("GPUPool Controller", func() { g.Expect(pool.Status.ComponentStatus.HypervisorVersion).To(Equal(oldHash)) g.Expect(pool.Status.ComponentStatus.HyperVisorUpdateProgress).To(BeZero()) g.Expect(pool.Status.ComponentStatus.HypervisorConfigSynced).To(BeFalse()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) By("verifying hypervisor version should be updated upon configuration changes") updateHypervisorConfig(tfEnv) @@ -69,7 +70,7 @@ var _ = Describe("GPUPool Controller", func() { g.Expect(pool.Status.ComponentStatus.HypervisorVersion).To(Equal(newHash)) g.Expect(pool.Status.ComponentStatus.HyperVisorUpdateProgress).To(BeZero()) g.Expect(pool.Status.ComponentStatus.HypervisorConfigSynced).To(BeFalse()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) tfEnv.Cleanup() }) @@ -79,7 +80,7 @@ var _ = Describe("GPUPool Controller", func() { AddPoolWithNodeCount(1). SetGpuCountPerNode(1). Build() - updateRollingUpdatePolicy(tfEnv, false, 100, "3s") + updateRollingUpdatePolicy(tfEnv, false, 100, "1s") _, oldHash := triggerHypervisorUpdate(tfEnv) verifyAllHypervisorPodHashConsistently(tfEnv, oldHash) tfEnv.Cleanup() @@ -99,7 +100,7 @@ var _ = Describe("GPUPool Controller", func() { verifyHypervisorUpdateProgressConsistently(tfEnv, 50) By("changing the batch inteval to trigger next update batch") - updateRollingUpdatePolicy(tfEnv, true, 50, "3s") + updateRollingUpdatePolicy(tfEnv, true, 50, "1s") verifyHypervisorPodHash(tfEnv.GetGPUNode(0, 1), newHash) verifyHypervisorUpdateProgress(tfEnv, 100) @@ -128,7 +129,7 @@ var _ = Describe("GPUPool Controller", func() { AddPoolWithNodeCount(2). SetGpuCountPerNode(1). Build() - updateRollingUpdatePolicy(tfEnv, true, 50, "3s") + updateRollingUpdatePolicy(tfEnv, true, 50, "1s") newHash, _ := triggerHypervisorUpdate(tfEnv) verifyHypervisorPodHash(tfEnv.GetGPUNode(0, 0), newHash) verifyHypervisorPodHash(tfEnv.GetGPUNode(0, 1), newHash) @@ -136,24 +137,12 @@ var _ = Describe("GPUPool Controller", func() { tfEnv.Cleanup() }) - // It("Should perform update according to non-divisible batch percentage", func() { - // tfEnv := NewTensorFusionEnvBuilder(). - // AddPoolWithNodeCount(3). - // SetGpuCountPerNode(1). - // Build() - // updateRollingUpdatePolicy(tfEnv, true, 66, "3s") - // newHash, _ := triggerHypervisorUpdate(tfEnv) - // verifyAllHypervisorPodHash(tfEnv, newHash) - // verifyHypervisorUpdateProgress(tfEnv, 100) - // tfEnv.Cleanup() - // }) - It("Should update all nodes at once if BatchPercentage is 100", func() { tfEnv := NewTensorFusionEnvBuilder(). AddPoolWithNodeCount(3). SetGpuCountPerNode(1). 
Build() - updateRollingUpdatePolicy(tfEnv, true, 100, "3s") + updateRollingUpdatePolicy(tfEnv, true, 100, "1s") newHash, _ := triggerHypervisorUpdate(tfEnv) verifyAllHypervisorPodHash(tfEnv, newHash) verifyHypervisorUpdateProgress(tfEnv, 100) @@ -172,7 +161,7 @@ var _ = Describe("GPUPool Controller", func() { g.Expect(pool.Status.ComponentStatus.WorkerVersion).To(Equal(oldHash)) g.Expect(pool.Status.ComponentStatus.WorkerUpdateProgress).To(BeZero()) g.Expect(pool.Status.ComponentStatus.WorkerConfigSynced).To(BeFalse()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) By("verifying worker version should be updated upon configuration changes") updateWorkerConfig(tfEnv) @@ -183,7 +172,7 @@ var _ = Describe("GPUPool Controller", func() { g.Expect(pool.Status.ComponentStatus.WorkerVersion).To(Equal(newHash)) g.Expect(pool.Status.ComponentStatus.WorkerUpdateProgress).To(BeZero()) g.Expect(pool.Status.ComponentStatus.WorkerConfigSynced).To(BeFalse()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) tfEnv.Cleanup() }) @@ -194,15 +183,15 @@ var _ = Describe("GPUPool Controller", func() { SetGpuCountPerNode(1). Build() - By("configuring a large enougth batch inteval to prevent next update batch") + By("configuring a large enough batch interval to prevent next update batch") updateRollingUpdatePolicy(tfEnv, true, 50, "10m") createWorkloads(tfEnv, 2) triggerWorkerUpdate(tfEnv) verifyWorkerPodContainerNameConsistently(1, "tensorfusion-worker") verifyWorkerUpdateProgressConsistently(tfEnv, 50) - By("changing the batch inteval to trigger next update batch") - updateRollingUpdatePolicy(tfEnv, true, 50, "3s") + By("changing the batch interval to trigger next update batch") + updateRollingUpdatePolicy(tfEnv, true, 50, "1s") verifyAllWorkerPodContainerName(tfEnv, "updated-name") verifyWorkerUpdateProgress(tfEnv, 100) @@ -215,7 +204,7 @@ var _ = Describe("GPUPool Controller", func() { AddPoolWithNodeCount(1). SetGpuCountPerNode(2). Build() - updateRollingUpdatePolicy(tfEnv, true, 50, "3s") + updateRollingUpdatePolicy(tfEnv, true, 50, "1s") createWorkloads(tfEnv, 2) triggerWorkerUpdate(tfEnv) verifyAllWorkerPodContainerName(tfEnv, "updated-name") @@ -229,7 +218,7 @@ var _ = Describe("GPUPool Controller", func() { AddPoolWithNodeCount(1). SetGpuCountPerNode(2). 
Build() - updateRollingUpdatePolicy(tfEnv, true, 100, "3s") + updateRollingUpdatePolicy(tfEnv, true, 100, "1s") createWorkloads(tfEnv, 2) triggerWorkerUpdate(tfEnv) verifyAllWorkerPodContainerName(tfEnv, "updated-name") @@ -248,7 +237,7 @@ var _ = Describe("GPUPool Controller", func() { g.Expect(pool.Status.ComponentStatus.ClientVersion).To(Equal(oldHash)) g.Expect(pool.Status.ComponentStatus.ClientUpdateProgress).To(BeZero()) g.Expect(pool.Status.ComponentStatus.ClientConfigSynced).To(BeFalse()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) By("verifying client version should be updated upon configuration changes") updateClientConfig(tfEnv) @@ -259,7 +248,7 @@ var _ = Describe("GPUPool Controller", func() { g.Expect(pool.Status.ComponentStatus.ClientVersion).To(Equal(newHash)) g.Expect(pool.Status.ComponentStatus.ClientUpdateProgress).To(BeZero()) g.Expect(pool.Status.ComponentStatus.ClientConfigSynced).To(BeFalse()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) tfEnv.Cleanup() }) @@ -389,7 +378,7 @@ func updateRollingUpdatePolicy(tfEnv *TensorFusionEnv, autoUpdate bool, batchPer g.Expect(newPolicy.AutoUpdate).Should(Equal(policy.AutoUpdate)) g.Expect(newPolicy.BatchPercentage).Should(Equal(policy.BatchPercentage)) g.Expect(newPolicy.BatchInterval).Should(Equal(policy.BatchInterval)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyGpuPoolClientHash(tfEnv *TensorFusionEnv, oldHash string) string { @@ -400,7 +389,7 @@ func verifyGpuPoolClientHash(tfEnv *TensorFusionEnv, oldHash string) string { newHash := utils.GetObjectHash(pool.Spec.ComponentConfig.Client) g.Expect(newHash).ShouldNot(Equal(oldHash)) g.Expect(pool.Status.ComponentStatus.ClientVersion).To(Equal(newHash)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) return pool.Status.ComponentStatus.ClientVersion } @@ -413,7 +402,7 @@ func verifyGpuPoolHypervisorHash(tfEnv *TensorFusionEnv, oldHash string) string newHash := utils.GetObjectHash(pool.Spec.ComponentConfig.Hypervisor) g.Expect(newHash).ShouldNot(Equal(oldHash)) g.Expect(pool.Status.ComponentStatus.HypervisorVersion).To(Equal(newHash)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) return pool.Status.ComponentStatus.HypervisorVersion } @@ -426,7 +415,7 @@ func verifyGpuPoolWorkerHash(tfEnv *TensorFusionEnv, oldHash string) string { newHash := utils.GetObjectHash(pool.Spec.ComponentConfig.Worker) g.Expect(newHash).ShouldNot(Equal(oldHash)) g.Expect(pool.Status.ComponentStatus.WorkerVersion).To(Equal(newHash)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) return pool.Status.ComponentStatus.WorkerVersion } @@ -441,7 +430,7 @@ func verifyHypervisorPodHash(gpuNode *tfv1.GPUNode, hash string) { }, pod)).Should(Succeed()) g.Expect(pod.Labels[constants.LabelKeyPodTemplateHash]).Should(Equal(hash)) updatePodPhaseToRunning(pod, hash) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyClientPodHash(index int, hash string) { @@ -451,7 +440,7 @@ func verifyClientPodHash(index int, hash string) { key := client.ObjectKey{Namespace: utils.CurrentNamespace(), Name: getClientPodName(index)} g.Expect(k8sClient.Get(ctx, key, pod)).Should(Succeed()) g.Expect(pod.Labels[constants.LabelKeyPodTemplateHash]).Should(Equal(hash)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyClientPodHashConsistently(index int, hash string) { @@ -461,7 +450,7 @@ func verifyClientPodHashConsistently(index int, hash string) { key := 
client.ObjectKey{Namespace: utils.CurrentNamespace(), Name: getClientPodName(index)} g.Expect(k8sClient.Get(ctx, key, pod)).Should(Succeed()) g.Expect(pod.Labels[constants.LabelKeyPodTemplateHash]).Should(Equal(hash)) - }, duration, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyHypervisorPodHashConsistently(gpuNode *tfv1.GPUNode, hash string) { @@ -474,7 +463,7 @@ func verifyHypervisorPodHashConsistently(gpuNode *tfv1.GPUNode, hash string) { }, pod)).Should(Succeed()) g.Expect(pod.Labels[constants.LabelKeyPodTemplateHash]).Should(Equal(hash)) updatePodPhaseToRunning(pod, hash) - }, duration, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyClientPodWasDeleted(index int) { @@ -482,7 +471,7 @@ func verifyClientPodWasDeleted(index int) { pod := &corev1.Pod{} key := client.ObjectKey{Namespace: utils.CurrentNamespace(), Name: getClientPodName(index)} g.Expect(k8sClient.Get(ctx, key, pod)).ShouldNot(Succeed()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyAllHypervisorPodHash(tfEnv *TensorFusionEnv, hash string) { @@ -499,7 +488,7 @@ func verifyAllHypervisorPodHash(tfEnv *TensorFusionEnv, hash string) { g.Expect(pod.Labels[constants.LabelKeyPodTemplateHash]).Should(Equal(hash)) updatePodPhaseToRunning(pod, hash) } - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } // func verifyWorkerPodContainerName(workloadIndex int, name string) { @@ -513,7 +502,7 @@ func verifyAllHypervisorPodHash(tfEnv *TensorFusionEnv, hash string) { // for _, pod := range podList.Items { // g.Expect(pod.Spec.Containers[0].Name).Should(Equal(name)) // } -// }, timeout, interval).Should(Succeed()) +// }).Should(Succeed()) // } func verifyWorkerPodContainerNameConsistently(workloadIndex int, name string) { @@ -527,7 +516,7 @@ func verifyWorkerPodContainerNameConsistently(workloadIndex int, name string) { for _, pod := range podList.Items { g.Expect(pod.Spec.Containers[0].Name).Should(Equal(name)) } - }, duration, interval).Should(Succeed()) + }, 1*time.Second).Should(Succeed()) } func verifyAllWorkerPodContainerName(tfEnv *TensorFusionEnv, name string) { @@ -549,7 +538,7 @@ func verifyAllWorkerPodContainerName(tfEnv *TensorFusionEnv, name string) { } } - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyAllHypervisorPodHashConsistently(tfEnv *TensorFusionEnv, hash string) { @@ -565,7 +554,7 @@ func verifyAllHypervisorPodHashConsistently(tfEnv *TensorFusionEnv, hash string) g.Expect(pod.Labels[constants.LabelKeyPodTemplateHash]).Should(Equal(hash)) updatePodPhaseToRunning(pod, hash) } - }, duration, interval).Should(Succeed()) + }).Should(Succeed()) } // func verifyAllWorkerPodContainerNameConsistently(tfEnv *TensorFusionEnv, name string) { @@ -600,7 +589,7 @@ func verifyHypervisorUpdateProgress(tfEnv *TensorFusionEnv, progress int32) { } else { g.Expect(pool.Status.ComponentStatus.HypervisorConfigSynced).To(BeFalse()) } - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyWorkerUpdateProgress(tfEnv *TensorFusionEnv, progress int32) { @@ -613,7 +602,7 @@ func verifyWorkerUpdateProgress(tfEnv *TensorFusionEnv, progress int32) { } else { g.Expect(pool.Status.ComponentStatus.WorkerConfigSynced).To(BeFalse()) } - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyClientUpdateProgress(tfEnv *TensorFusionEnv, progress int32) { @@ -626,7 +615,7 @@ func verifyClientUpdateProgress(tfEnv *TensorFusionEnv, progress int32) { } else { 
g.Expect(pool.Status.ComponentStatus.ClientConfigSynced).To(BeFalse()) } - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyClientUpdateProgressConsistently(tfEnv *TensorFusionEnv, progress int32) { @@ -639,7 +628,7 @@ func verifyClientUpdateProgressConsistently(tfEnv *TensorFusionEnv, progress int } else { g.Expect(pool.Status.ComponentStatus.ClientConfigSynced).To(BeFalse()) } - }, duration, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyHypervisorUpdateProgressConsistently(tfEnv *TensorFusionEnv, progress int32) { @@ -652,7 +641,7 @@ func verifyHypervisorUpdateProgressConsistently(tfEnv *TensorFusionEnv, progress } else { g.Expect(pool.Status.ComponentStatus.HypervisorConfigSynced).To(BeFalse()) } - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func verifyWorkerUpdateProgressConsistently(tfEnv *TensorFusionEnv, progress int32) { @@ -665,7 +654,7 @@ func verifyWorkerUpdateProgressConsistently(tfEnv *TensorFusionEnv, progress int } else { g.Expect(pool.Status.ComponentStatus.WorkerConfigSynced).To(BeFalse()) } - }, duration, interval).Should(Succeed()) + }).Should(Succeed()) } // no pod controller in EnvTest, need to manually update pod status @@ -684,7 +673,7 @@ func ensureGpuPoolIsRunning(tfEnv *TensorFusionEnv) { Eventually(func(g Gomega) { pool := tfEnv.GetGPUPool(0) g.Expect(pool.Status.Phase).Should(Equal(tfv1.TensorFusionPoolPhaseRunning)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } // no RepliaSet like controller in EnvTest, need to create by ourself @@ -716,7 +705,7 @@ func createClientPodByIndex(tfEnv *TensorFusionEnv, index int) { pod := &corev1.Pod{} key := client.ObjectKey{Namespace: utils.CurrentNamespace(), Name: getClientPodName(index)} g.Expect(k8sClient.Get(ctx, key, pod)).Should(Succeed()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func createClientPods(tfEnv *TensorFusionEnv, count int) { @@ -751,7 +740,7 @@ func createClientPods(tfEnv *TensorFusionEnv, count int) { key := client.ObjectKey{Namespace: utils.CurrentNamespace(), Name: getClientPodName(i)} g.Expect(k8sClient.Get(ctx, key, pod)).Should(Succeed()) } - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func cleanupClientPods() { @@ -766,6 +755,7 @@ func createWorkloads(tfEnv *TensorFusionEnv, count int) { replicas := 1 workload := createTensorFusionWorkload(pool.Name, key, replicas) checkWorkerPodCount(workload) + checkWorkloadStatus(workload) } } diff --git a/internal/controller/pod_controller.go b/internal/controller/pod_controller.go index 7dbb997c..d3eacae7 100644 --- a/internal/controller/pod_controller.go +++ b/internal/controller/pod_controller.go @@ -19,9 +19,11 @@ package controller import ( "context" "fmt" + "strconv" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/portallocator" "github.com/NexusGPU/tensor-fusion/internal/utils" v1 "github.com/NexusGPU/tensor-fusion/internal/webhook/v1" "github.com/samber/lo" @@ -40,7 +42,8 @@ import ( // PodReconciler reconciles a Pod object type PodReconciler struct { client.Client - Scheme *runtime.Scheme + Scheme *runtime.Scheme + PortAllocator *portallocator.PortAllocator } // +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;create;update;patch;delete;deletecollection @@ -59,6 +62,15 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R log.Error(err, "Failed to get Pod") return ctrl.Result{}, 
err } + + // Release cluster level port when Pod deleted + if !pod.DeletionTimestamp.IsZero() { + if pod.Annotations[constants.GenHostPortLabel] == constants.GenHostPortLabelValue { + podPortNumber, _ := strconv.Atoi(pod.Annotations[constants.GenPortNumberAnnotation]) + _ = r.PortAllocator.ReleaseClusterLevelHostPort(pod.Name, podPortNumber) + log.Info("Released port", "pod", pod.Name, "port", podPortNumber) + } + } // generate tensor fusion connections and apply to cluster tfConnection := generateTensorFusionConnection(pod) if tfConnection == nil { diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go index 3ec5e77f..8945a295 100644 --- a/internal/controller/suite_test.go +++ b/internal/controller/suite_test.go @@ -35,6 +35,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" + "k8s.io/client-go/util/retry" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/envtest" @@ -47,7 +48,10 @@ import ( "github.com/NexusGPU/tensor-fusion/internal/config" "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" + "github.com/NexusGPU/tensor-fusion/internal/metrics" + "github.com/NexusGPU/tensor-fusion/internal/portallocator" "github.com/NexusGPU/tensor-fusion/internal/utils" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" // +kubebuilder:scaffold:imports ) @@ -60,16 +64,14 @@ var testEnv *envtest.Environment var ctx context.Context var cancel context.CancelFunc var allocator *gpuallocator.GpuAllocator - -const ( - timeout = time.Second * 10 - duration = time.Second * 5 - interval = time.Millisecond * 100 -) +var metricsRecorder *metrics.MetricsRecorder func TestControllers(t *testing.T) { RegisterFailHandler(Fail) - + SetDefaultEventuallyTimeout(6 * time.Second) + SetDefaultEventuallyPollingInterval(200 * time.Millisecond) + SetDefaultConsistentlyDuration(5 * time.Second) + SetDefaultConsistentlyPollingInterval(200 * time.Millisecond) RunSpecs(t, "Controller Suite") } @@ -119,12 +121,25 @@ var _ = BeforeSuite(func() { mgr, err := ctrl.NewManager(cfg, ctrl.Options{ Scheme: scheme.Scheme, + Metrics: metricsserver.Options{ + BindAddress: "0", + }, }) Expect(err).ToNot(HaveOccurred()) + + metricsRecorder = &metrics.MetricsRecorder{ + MetricsOutputPath: "./metrics.log", + HourlyUnitPriceMap: map[string]float64{ + "A100": 10, + }, + WorkerUnitPriceMap: make(map[string]map[string]metrics.RawBillingPricing), + } + err = (&TensorFusionClusterReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Recorder: mgr.GetEventRecorderFor("TensorFusionCluster"), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("TensorFusionCluster"), + MetricsRecorder: metricsRecorder, }).SetupWithManager(mgr) Expect(err).ToNot(HaveOccurred()) @@ -142,12 +157,11 @@ var _ = BeforeSuite(func() { }).SetupWithManager(mgr) Expect(err).ToNot(HaveOccurred()) - // err = (&GPUPoolCompactionReconciler{ - // Client: mgr.GetClient(), - // Scheme: mgr.GetScheme(), - // Recorder: mgr.GetEventRecorderFor("GPUPoolCompaction"), - // }).SetupWithManager(mgr) - // Expect(err).ToNot(HaveOccurred()) + portAllocator, err := portallocator.NewPortAllocator(ctx, mgr.GetClient(), "40000-42000", "42001-60000") + if err != nil { + Expect(err).ToNot(HaveOccurred()) + } + _ = portAllocator.SetupWithManager(ctx, mgr) err = (&GPUNodeClassReconciler{ 
Client: mgr.GetClient(), @@ -162,8 +176,9 @@ var _ = BeforeSuite(func() { Expect(err).ToNot(HaveOccurred()) err = (&PodReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + PortAllocator: portAllocator, }).SetupWithManager(mgr) Expect(err).ToNot(HaveOccurred()) @@ -180,7 +195,7 @@ var _ = BeforeSuite(func() { }).SetupWithManager(mgr) Expect(err).ToNot(HaveOccurred()) - allocator = gpuallocator.NewGpuAllocator(ctx, mgr.GetClient(), 3*time.Second) + allocator = gpuallocator.NewGpuAllocator(ctx, mgr.GetClient(), 150*time.Millisecond) _, err = allocator.SetupWithManager(ctx, mgr) Expect(err).ToNot(HaveOccurred()) @@ -198,11 +213,12 @@ var _ = BeforeSuite(func() { Expect(err).ToNot(HaveOccurred()) err = (&TensorFusionWorkloadReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Allocator: allocator, - Recorder: mgr.GetEventRecorderFor("TensorFusionWorkload"), - GpuInfos: config.MockGpuInfo(), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Allocator: allocator, + Recorder: mgr.GetEventRecorderFor("TensorFusionWorkload"), + GpuInfos: config.MockGpuInfo(), + PortAllocator: portAllocator, }).SetupWithManager(mgr) Expect(err).ToNot(HaveOccurred()) @@ -232,15 +248,21 @@ type TensorFusionEnv struct { func (c *TensorFusionEnv) GetCluster() *tfv1.TensorFusionCluster { GinkgoHelper() tfc := &tfv1.TensorFusionCluster{} - Eventually(func(g Gomega) { - g.Expect(k8sClient.Get(ctx, c.clusterKey, tfc)).Should(Succeed()) - }).Should(Succeed()) + Expect(k8sClient.Get(ctx, c.clusterKey, tfc)).Should(Succeed()) return tfc } func (c *TensorFusionEnv) UpdateCluster(tfc *tfv1.TensorFusionCluster) { GinkgoHelper() - Expect(k8sClient.Update(ctx, tfc)).Should(Succeed()) + err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + latest := &tfv1.TensorFusionCluster{} + if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(tfc), latest); err != nil { + return err + } + latest.Spec = tfc.Spec + return k8sClient.Update(ctx, latest) + }) + Expect(err).Should(Succeed()) } func (c *TensorFusionEnv) Cleanup() { @@ -260,7 +282,7 @@ func (c *TensorFusionEnv) Cleanup() { Eventually(func(g Gomega) { pool := &tfv1.GPUPool{} g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: c.getPoolName(poolIndex)}, pool)).Should(HaveOccurred()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) delete(c.poolNodeMap, poolIndex) c.poolCount-- } @@ -269,7 +291,7 @@ func (c *TensorFusionEnv) Cleanup() { Eventually(func(g Gomega) { err := k8sClient.Get(ctx, c.clusterKey, tfc) g.Expect(err).Should(HaveOccurred()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func (c *TensorFusionEnv) GetGPUPoolList() *tfv1.GPUPoolList { @@ -280,7 +302,7 @@ func (c *TensorFusionEnv) GetGPUPoolList() *tfv1.GPUPoolList { constants.LabelKeyOwner: c.clusterKey.Name, }))).Should(Succeed()) g.Expect(poolList.Items).Should(HaveLen(c.poolCount)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) return poolList } @@ -289,7 +311,7 @@ func (c *TensorFusionEnv) GetGPUPool(poolIndex int) *tfv1.GPUPool { pool := &tfv1.GPUPool{} Eventually(func(g Gomega) { g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: c.getPoolName(poolIndex)}, pool)).Should(Succeed()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) return pool } @@ -301,7 +323,7 @@ func (c *TensorFusionEnv) GetGPUNodeList(poolIndex int) *tfv1.GPUNodeList { fmt.Sprintf(constants.GPUNodePoolIdentifierLabelFormat, c.getPoolName(poolIndex)): "true", 
}))).Should(Succeed()) g.Expect(nodeList.Items).Should(HaveLen(len(c.poolNodeMap[poolIndex]))) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) return nodeList } @@ -310,7 +332,7 @@ func (c *TensorFusionEnv) GetGPUNode(poolIndex int, nodeIndex int) *tfv1.GPUNode node := &tfv1.GPUNode{} Eventually(func(g Gomega) { g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: c.getNodeName(poolIndex, nodeIndex)}, node)).Should(Succeed()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) return node } @@ -321,7 +343,7 @@ func (c *TensorFusionEnv) DeleteGPUNode(poolIndex int, nodeIndex int) { Expect(k8sClient.Delete(ctx, node)).Should(Succeed()) Eventually(func(g Gomega) { g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: c.getNodeName(poolIndex, nodeIndex)}, node)).Should(HaveOccurred()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) delete(c.poolNodeMap[poolIndex], nodeIndex) } @@ -333,7 +355,7 @@ func (c *TensorFusionEnv) GetNodeGpuList(poolIndex int, nodeIndex int) *tfv1.GPU constants.LabelKeyOwner: c.getNodeName(poolIndex, nodeIndex), }))).Should(Succeed()) g.Expect(gpuList.Items).Should(HaveLen(c.poolNodeMap[poolIndex][nodeIndex])) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) return gpuList } @@ -356,7 +378,7 @@ func (c *TensorFusionEnv) GetPoolGpuList(poolIndex int) *tfv1.GPUList { constants.GpuPoolKey: c.getPoolName(poolIndex), }))).Should(Succeed()) g.Expect(gpuList.Items).Should(HaveLen(poolGpuCount)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) return gpuList } @@ -377,7 +399,7 @@ func (c *TensorFusionEnv) UpdateHypervisorStatus() { }), )).Should(Succeed()) g.Expect(podList.Items).Should(HaveLen(len(c.poolNodeMap[poolIndex]))) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) for _, pod := range podList.Items { pod.Status.Phase = corev1.PodRunning pod.Status.Conditions = append(pod.Status.Conditions, corev1.PodCondition{Type: corev1.PodReady, Status: corev1.ConditionTrue}) @@ -453,6 +475,14 @@ func (b *TensorFusionEnvBuilder) Build() *TensorFusionEnv { Name: b.clusterKey.Name, Namespace: b.clusterKey.Namespace, }, + Spec: tfv1.TensorFusionClusterSpec{ + GPUPools: []tfv1.GPUPoolDefinition{ + { + Name: fmt.Sprintf("pool-%d", b.poolCount), + SpecTemplate: *config.MockGPUPoolSpec, + }, + }, + }, } // construct pools @@ -477,7 +507,7 @@ func (b *TensorFusionEnvBuilder) Build() *TensorFusionEnv { constants.LabelKeyOwner: tfc.Name, }))).Should(Succeed()) g.Expect(gpuPoolList.Items).Should(HaveLen(b.poolCount)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) // generate nodes selectors := strings.Split(constants.InitialGPUNodeSelector, "=") diff --git a/internal/controller/tensorfusioncluster_controller.go b/internal/controller/tensorfusioncluster_controller.go index 035f8853..5eb93ed7 100644 --- a/internal/controller/tensorfusioncluster_controller.go +++ b/internal/controller/tensorfusioncluster_controller.go @@ -19,6 +19,7 @@ package controller import ( "context" "fmt" + "strconv" "sync" "k8s.io/apimachinery/pkg/api/errors" @@ -35,6 +36,7 @@ import ( tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/cloudprovider" "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/metrics" utils "github.com/NexusGPU/tensor-fusion/internal/utils" corev1 "k8s.io/api/core/v1" @@ -46,8 +48,9 @@ import ( // TensorFusionClusterReconciler reconciles a TensorFusionCluster object type TensorFusionClusterReconciler 
struct { client.Client - Scheme *runtime.Scheme - Recorder record.EventRecorder + Scheme *runtime.Scheme + Recorder record.EventRecorder + MetricsRecorder *metrics.MetricsRecorder LastProcessedItems sync.Map } @@ -302,6 +305,7 @@ func (r *TensorFusionClusterReconciler) reconcileGPUPool(ctx context.Context, tf } err = r.Create(ctx, gpupool) anyPoolChanged = true + r.updateMetricsRecorder(ctx, gpupool) if err != nil { errors = append(errors, fmt.Errorf("failed to create GPUPool %s: %w", key, err)) continue @@ -315,6 +319,7 @@ func (r *TensorFusionClusterReconciler) reconcileGPUPool(ctx context.Context, tf errors = append(errors, fmt.Errorf("failed to update GPUPool %s: %w", key, err)) } anyPoolChanged = true + r.updateMetricsRecorder(ctx, existingPool) } } } @@ -412,3 +417,33 @@ func (r *TensorFusionClusterReconciler) SetupWithManager(mgr ctrl.Manager) error Owns(&tfv1.GPUPool{}). Complete(r) } + +// Update metrics recorder's raw billing map +func (r *TensorFusionClusterReconciler) updateMetricsRecorder(ctx context.Context, pool *tfv1.GPUPool) { + log := log.FromContext(ctx) + if pool.Spec.QosConfig == nil { + log.Info("QosConfig is nil, skip updating metrics recorder", "pool", pool.Name) + return + } + + qosConfig := pool.Spec.QosConfig + if _, ok := r.MetricsRecorder.WorkerUnitPriceMap[pool.Name]; !ok { + r.MetricsRecorder.WorkerUnitPriceMap[pool.Name] = make(map[string]metrics.RawBillingPricing) + } + pricingDetail := r.MetricsRecorder.WorkerUnitPriceMap[pool.Name] + for _, pricing := range qosConfig.Pricing { + tflopsPerHour, _ := strconv.ParseFloat(pricing.Requests.PerFP16TFlopsPerHour, 64) + vramPerHour, _ := strconv.ParseFloat(pricing.Requests.PerGBOfVRAMPerHour, 64) + limitOverRequestChargingRatio, _ := strconv.ParseFloat(pricing.LimitsOverRequestsChargingRatio, 64) + + pricingDetail[string(pricing.Qos)] = metrics.RawBillingPricing{ + TflopsPerSecond: tflopsPerHour / float64(3600), + VramPerSecond: vramPerHour / float64(3600), + + TflopsOverRequestPerSecond: tflopsPerHour / float64(3600) * limitOverRequestChargingRatio, + VramOverRequestPerSecond: vramPerHour / float64(3600) * limitOverRequestChargingRatio, + } + } + + log.V(5).Info("Updated metrics recorder", "pool", pool.Name, "pricing", pricingDetail) +} diff --git a/internal/controller/tensorfusionconnection_controller_test.go b/internal/controller/tensorfusionconnection_controller_test.go index 61449da4..87f685c8 100644 --- a/internal/controller/tensorfusionconnection_controller_test.go +++ b/internal/controller/tensorfusionconnection_controller_test.go @@ -103,7 +103,7 @@ var _ = Describe("TensorFusionConnection Controller", func() { g.Expect(connection.Status.Phase).Should(Equal(workerStatus.WorkerPhase)) connectionUrl := fmt.Sprintf("native+%s+%d+%s-%s", workerStatus.WorkerIp, workerStatus.WorkerPort, workerStatus.WorkerName, workerStatus.ResourceVersion) g.Expect(connection.Status.ConnectionURL).Should(Equal(connectionUrl)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) }) It("should handle missing workload label", func() { @@ -122,7 +122,7 @@ var _ = Describe("TensorFusionConnection Controller", func() { Consistently(func(g Gomega) { g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(connectionNoLabel), connectionNoLabel)).Should(Succeed()) g.Expect(connectionNoLabel.Status.WorkerName).Should(Equal("")) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) // Clean up the test connection Expect(k8sClient.Delete(ctx, connectionNoLabel)).To(Succeed()) @@ -138,7 +138,7 @@ var _ = 
Describe("TensorFusionConnection Controller", func() { g.Expect(k8sClient.Get(ctx, typeNamespacedName, connection)).Should(Succeed()) workerStatus := workload.Status.WorkerStatuses[0] g.Expect(connection.Status.WorkerName).Should(Equal(workerStatus.WorkerName)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) By("Updating the workload to mark the worker as failed") Expect(k8sClient.Get(ctx, workloadNamespacedName, workload)).To(Succeed()) @@ -154,7 +154,7 @@ var _ = Describe("TensorFusionConnection Controller", func() { g.Expect(connection.Status.Phase).Should(Equal(workerStatus.WorkerPhase)) connectionUrl := fmt.Sprintf("native+%s+%d+%s-%s", workerStatus.WorkerIp, workerStatus.WorkerPort, workerStatus.WorkerName, workerStatus.ResourceVersion) g.Expect(connection.Status.ConnectionURL).Should(Equal(connectionUrl)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) }) It("should update status to WorkerPending when worker selection fails", func() { @@ -201,7 +201,7 @@ var _ = Describe("TensorFusionConnection Controller", func() { return false } return len(createdWorkload.Status.WorkerStatuses) == 0 - }, timeout, interval).Should(BeTrue()) + }).Should(BeTrue()) By("Creating a connection to the workload with no workers") failConnectionName := "test-connection-fail" @@ -230,7 +230,7 @@ var _ = Describe("TensorFusionConnection Controller", func() { return false } return failConnection.Status.Phase == tfv1.WorkerPending - }, timeout, interval).Should(BeTrue()) + }).Should(BeTrue()) By("Cleaning up test resources") Expect(k8sClient.Delete(ctx, failConnection)).To(Succeed()) diff --git a/internal/controller/tensorfusionworkload_controller.go b/internal/controller/tensorfusionworkload_controller.go index a10b0085..54cf63a8 100644 --- a/internal/controller/tensorfusionworkload_controller.go +++ b/internal/controller/tensorfusionworkload_controller.go @@ -20,7 +20,9 @@ import ( "context" "fmt" "sort" + "strconv" "strings" + "time" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" @@ -38,20 +40,21 @@ import ( "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" "github.com/NexusGPU/tensor-fusion/internal/metrics" + "github.com/NexusGPU/tensor-fusion/internal/portallocator" "github.com/NexusGPU/tensor-fusion/internal/utils" "github.com/NexusGPU/tensor-fusion/internal/worker" "github.com/lithammer/shortuuid/v4" - "github.com/prometheus/client_golang/prometheus" "github.com/samber/lo" ) // TensorFusionWorkloadReconciler reconciles a TensorFusionWorkload object type TensorFusionWorkloadReconciler struct { client.Client - Scheme *runtime.Scheme - Allocator *gpuallocator.GpuAllocator - Recorder record.EventRecorder - GpuInfos *[]config.GpuInfo + Scheme *runtime.Scheme + Allocator *gpuallocator.GpuAllocator + Recorder record.EventRecorder + GpuInfos *[]config.GpuInfo + PortAllocator *portallocator.PortAllocator } // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=tensorfusionworkloads,verbs=get;list;watch;create;update;patch;delete @@ -59,6 +62,8 @@ type TensorFusionWorkloadReconciler struct { // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=tensorfusionworkloads/finalizers,verbs=update // TensorFusionWorkload Reconciler +// +//nolint:gocyclo func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { log := log.FromContext(ctx) log.Info("Reconciling TensorFusionWorkload", "request", req) @@ -107,7 +112,14 @@ func (r 
*TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl // Process pods with our finalizer for i := range podList.Items { pod := &podList.Items[i] - deleted := pod.DeletionTimestamp != nil + deleted := !pod.DeletionTimestamp.IsZero() + + if deleted { + metrics.RemoveWorkerMetrics(pod.Name, pod.DeletionTimestamp.Time) + podPort, _ := strconv.Atoi(pod.Annotations[constants.GenPortNumberAnnotation]) + _ = r.PortAllocator.ReleaseHostPort(pod.Spec.NodeName, podPort) + } + // Handle our GPU resource cleanup finalizer _, err := utils.HandleFinalizer(ctx, pod, r.Client, func(ctx context.Context, obj *corev1.Pod) (bool, error) { return r.handlePodGPUCleanup(ctx, pod, workload) @@ -128,6 +140,9 @@ func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl return ctrl.Result{}, nil } + // init metrics map if needed + handleMetricsRecorder(podList, workload) + // Fetch the GPUPool pool := &tfv1.GPUPool{} if err := r.Get(ctx, client.ObjectKey{Name: workload.Spec.PoolName}, pool); err != nil { @@ -239,6 +254,14 @@ func (r *TensorFusionWorkloadReconciler) reconcileScaling( return ctrl.Result{}, nil } +func handleMetricsRecorder(podList *corev1.PodList, workload *tfv1.TensorFusionWorkload) { + now := time.Now() + for i := range podList.Items { + pod := &podList.Items[i] + metrics.SetWorkerMetricsByWorkload(pod, workload, now) + } +} + func (r *TensorFusionWorkloadReconciler) tryStartWorker( ctx context.Context, workerGenerator *worker.WorkerGenerator, @@ -246,8 +269,14 @@ func (r *TensorFusionWorkloadReconciler) tryStartWorker( workload *tfv1.TensorFusionWorkload, hash string, ) (*corev1.Pod, error) { - port := workerGenerator.AllocPort() - pod, hash, err := workerGenerator.GenerateWorkerPod(gpus, fmt.Sprintf("%s-tf-worker-", workload.Name), workload.Namespace, port, workload.Spec.Resources.Limits, hash) + if len(gpus) == 0 || gpus[0].Labels == nil { + return nil, fmt.Errorf("no gpus or no labels, can not assign host port for worker") + } + port, err := r.PortAllocator.AssignHostPort(gpus[0].Status.NodeSelector[constants.KubernetesHostNameLabel]) + if err != nil { + return nil, fmt.Errorf("get host port %w", err) + } + pod, hash, err := workerGenerator.GenerateWorkerPod(gpus, fmt.Sprintf("%s-tf-worker-", workload.Name), workload.Namespace, port, workload.Spec.Resources.Requests, workload.Spec.Resources.Limits, hash) if err != nil { return nil, fmt.Errorf("generate worker pod %w", err) } @@ -287,23 +316,12 @@ func (r *TensorFusionWorkloadReconciler) scaleDownWorkers(ctx context.Context, w for i := range pods { podToDelete := &pods[i] - log.Info("Scaling down worker pod", "name", podToDelete.Name) + log.Info("Scaling down worker pod", "name", podToDelete.Name, "workload", workload.Name) // Delete the pod with foreground deletion policy // The finalizer will handle GPU resource cleanup if err := r.deletePod(ctx, podToDelete); err != nil { return err } - - labels := prometheus.Labels{ - "worker": podToDelete.Name, - "namespace": podToDelete.Namespace, - "pool": workload.Spec.PoolName, - } - metrics.GpuTflopsRequest.Delete(labels) - metrics.GpuTflopsLimit.Delete(labels) - metrics.VramBytesRequest.Delete(labels) - metrics.VramBytesLimit.Delete(labels) - metrics.GpuCount.Delete(labels) } return nil } @@ -339,7 +357,9 @@ func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context return types.NamespacedName{Name: gpuName} }) // Release GPU resources - if err := r.Allocator.Dealloc(ctx, workload.Spec.Resources.Requests, gpus); err != nil { + if err := 
r.Allocator.Dealloc(ctx, + tfv1.NameNamespace{Namespace: workload.Namespace, Name: workload.Name}, + workload.Spec.Resources.Requests, gpus); err != nil { log.Error(err, "Failed to release GPU resources, will retry", "gpus", gpus, "pod", pod.Name) return false, err } @@ -367,39 +387,29 @@ func (r *TensorFusionWorkloadReconciler) deletePod(ctx context.Context, pod *cor // scaleUpWorkers handles the scaling up of worker pods func (r *TensorFusionWorkloadReconciler) scaleUpWorkers(ctx context.Context, workerGenerator *worker.WorkerGenerator, workload *tfv1.TensorFusionWorkload, count int, hash string) (ctrl.Result, error) { log := log.FromContext(ctx) - + workloadNameNs := tfv1.NameNamespace{Namespace: workload.Namespace, Name: workload.Name} // Create worker pods for range count { // Schedule GPU for the worker - gpus, err := r.Allocator.Alloc(ctx, workload.Spec.PoolName, workload.Spec.Resources.Requests, workload.Spec.GPUCount, workload.Spec.GPUModel) + gpus, err := r.Allocator.Alloc(ctx, workload.Spec.PoolName, workloadNameNs, workload.Spec.Resources.Requests, workload.Spec.GPUCount, workload.Spec.GPUModel) if err != nil { r.Recorder.Eventf(workload, corev1.EventTypeWarning, "ScheduleGPUFailed", "Failed to schedule GPU: %v", err) return ctrl.Result{RequeueAfter: constants.PendingRequeueDuration}, nil } - pod, err := r.tryStartWorker(ctx, workerGenerator, gpus, workload, hash) + _, err = r.tryStartWorker(ctx, workerGenerator, gpus, workload, hash) if err != nil { // Try to release all allocated GPUs if pod creation fails gpus := lo.Map(gpus, func(gpu *tfv1.GPU, _ int) types.NamespacedName { return client.ObjectKeyFromObject(gpu) }) - releaseErr := r.Allocator.Dealloc(ctx, workload.Spec.Resources.Requests, gpus) + releaseErr := r.Allocator.Dealloc(ctx, workloadNameNs, workload.Spec.Resources.Requests, gpus) if releaseErr != nil { log.Error(releaseErr, "Failed to release GPU after pod creation failure", "gpus", gpus) } return ctrl.Result{}, fmt.Errorf("create worker pod: %w", err) } - labels := prometheus.Labels{ - "worker": pod.Name, - "namespace": pod.Namespace, - "pool": workload.Spec.PoolName, - } - metrics.GpuTflopsRequest.With(labels).Set(workload.Spec.Resources.Requests.Tflops.AsApproximateFloat64()) - metrics.GpuTflopsLimit.With(labels).Set(workload.Spec.Resources.Limits.Tflops.AsApproximateFloat64()) - metrics.VramBytesRequest.With(labels).Set(workload.Spec.Resources.Requests.Vram.AsApproximateFloat64()) - metrics.VramBytesLimit.With(labels).Set(workload.Spec.Resources.Limits.Vram.AsApproximateFloat64()) - metrics.GpuCount.With(labels).Set(float64(workload.Spec.GPUCount)) } return ctrl.Result{}, nil diff --git a/internal/controller/tensorfusionworkload_controller_test.go b/internal/controller/tensorfusionworkload_controller_test.go index ecb7ea74..be27da46 100644 --- a/internal/controller/tensorfusionworkload_controller_test.go +++ b/internal/controller/tensorfusionworkload_controller_test.go @@ -17,6 +17,7 @@ limitations under the License. 
package controller import ( + "bytes" "strings" "time" @@ -99,7 +100,7 @@ var _ = Describe("TensorFusionWorkload Controller", func() { gpuNames := strings.Split(podList.Items[0].Annotations[constants.GpuKey], ",") g.Expect(gpuNames).Should(HaveLen(2)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) checkWorkloadStatus(workload) }) @@ -136,7 +137,14 @@ var _ = Describe("TensorFusionWorkload Controller", func() { client.InNamespace(key.Namespace), client.MatchingLabels{constants.WorkloadKey: key.Name})).Should(Succeed()) g.Expect(podList.Items).Should(HaveLen(2)) - }, timeout, interval).Should(Succeed()) + + // Check if metrics is recorded correctly + byteWriter := bytes.NewBuffer([]byte{}) + metricsRecorder.RecordMetrics(byteWriter) + str := byteWriter.String() + g.Expect(str).Should(MatchRegexp("raw_cost=\\d+")) + + }).Should(Succeed()) // Store the original pod template hash var originalPodNames []string @@ -171,7 +179,7 @@ var _ = Describe("TensorFusionWorkload Controller", func() { for _, originalName := range originalPodNames { g.Expect(newPodNames).NotTo(ContainElement(originalName)) } - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) checkWorkloadStatus(workload) }) @@ -212,7 +220,7 @@ var _ = Describe("TensorFusionWorkload Controller", func() { return gpu.Status.Available.Tflops.Equal(resource.MustParse("1990")) && gpu.Status.Available.Vram.Equal(resource.MustParse("1992Gi")) }) return ok - }, timeout, interval).Should(BeTrue()) + }).Should(BeTrue()) Expect(k8sClient.Get(ctx, key, workload)).Should(Succeed()) workloadCopy := workload.DeepCopy() @@ -224,14 +232,14 @@ var _ = Describe("TensorFusionWorkload Controller", func() { client.InNamespace(key.Namespace), client.MatchingLabels{constants.WorkloadKey: key.Name})).To(Succeed()) g.Expect(podList.Items).Should(BeEmpty()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) Eventually(func(g Gomega) { gpu := &tfv1.GPU{} g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(&updatedGPU), gpu)).NotTo(HaveOccurred()) g.Expect(gpu.Status.Available.Tflops.Equal(resource.MustParse("2000"))).Should(BeTrue()) g.Expect(gpu.Status.Available.Vram.Equal(resource.MustParse("2000Gi"))).Should(BeTrue()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) }) }) @@ -248,7 +256,7 @@ var _ = Describe("TensorFusionWorkload Controller", func() { workload.Spec.GPUModel = "mock" // Update the workload g.Expect(k8sClient.Update(ctx, workload)).To(Succeed()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) checkWorkerPodCount(workload) checkWorkloadStatus(workload) @@ -265,7 +273,7 @@ var _ = Describe("TensorFusionWorkload Controller", func() { return pod.DeletionTimestamp == nil }) g.Expect(podList.Items).Should(HaveLen(1)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) // Now check if the pod has the correct GPU Eventually(func(g Gomega) { @@ -282,7 +290,7 @@ var _ = Describe("TensorFusionWorkload Controller", func() { }) g.Expect(ok).To(BeTrue()) g.Expect(gpu.Status.GPUModel).To(Equal("mock")) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) }) }) @@ -301,7 +309,7 @@ var _ = Describe("TensorFusionWorkload Controller", func() { client.InNamespace(key.Namespace), client.MatchingLabels{constants.WorkloadKey: key.Name})).To(Succeed()) g.Expect(podList.Items).To(HaveLen(2)) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) // delete workload Expect(k8sClient.Delete(ctx, workload)).To(Succeed()) @@ -313,14 +321,14 @@ var _ = 
Describe("TensorFusionWorkload Controller", func() { client.InNamespace(key.Namespace), client.MatchingLabels{constants.WorkloadKey: key.Name})).To(Succeed()) g.Expect(podList.Items).Should(BeEmpty()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) // wait for workload itself to be deleted Eventually(func(g Gomega) { w := &tfv1.TensorFusionWorkload{} err := k8sClient.Get(ctx, key, w) g.Expect(err).To(HaveOccurred()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) }) }) @@ -346,7 +354,7 @@ func checkWorkerPodCount(workload *tfv1.TensorFusionWorkload) { client.InNamespace(workload.Namespace), client.MatchingLabels{constants.WorkloadKey: workload.Name})).Should(Succeed()) g.Expect(podList.Items).Should(HaveLen(int(*workload.Spec.Replicas))) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func checkWorkloadStatus(in *tfv1.TensorFusionWorkload) { @@ -379,7 +387,7 @@ func checkWorkloadStatus(in *tfv1.TensorFusionWorkload) { g.Expect(readyCondition.Message).Should(ContainSubstring("Failed workers:")) } } - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } func createTensorFusionWorkload(poolName string, key client.ObjectKey, replicas int) *tfv1.TensorFusionWorkload { @@ -410,6 +418,7 @@ func createTensorFusionWorkload(poolName string, key client.ObjectKey, replicas Vram: vramLimits, }, }, + Qos: constants.QoSLevelMedium, }, } @@ -417,7 +426,7 @@ func createTensorFusionWorkload(poolName string, key client.ObjectKey, replicas Eventually(func(g Gomega) { g.Expect(k8sClient.Get(ctx, key, workload)).Should(Succeed()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) return workload } @@ -438,7 +447,7 @@ func cleanupWorkload(key client.ObjectKey) { g.Expect(k8sClient.Get(ctx, key, workload)).Should(Succeed()) workload.Spec.Replicas = ptr.Int32(0) g.Expect(k8sClient.Update(ctx, workload)).To(Succeed()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) Eventually(func(g Gomega) { podList := &corev1.PodList{} @@ -446,12 +455,12 @@ func cleanupWorkload(key client.ObjectKey) { client.InNamespace(key.Namespace), client.MatchingLabels{constants.WorkloadKey: key.Name})).To(Succeed()) g.Expect(podList.Items).Should(BeEmpty()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) Expect(k8sClient.Get(ctx, key, workload)).Should(Succeed()) Expect(k8sClient.Delete(ctx, workload)).To(Succeed()) Eventually(func(g Gomega) { err := k8sClient.Get(ctx, key, workload) g.Expect(err).Should(HaveOccurred()) - }, timeout, interval).Should(Succeed()) + }).Should(Succeed()) } diff --git a/internal/controller/workloadprofile_controller.go b/internal/controller/workloadprofile_controller.go index 9e352204..aa385a23 100644 --- a/internal/controller/workloadprofile_controller.go +++ b/internal/controller/workloadprofile_controller.go @@ -37,20 +37,9 @@ type WorkloadProfileReconciler struct { // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=workloadprofiles/status,verbs=get;update;patch // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=workloadprofiles/finalizers,verbs=update -// Reconcile is part of the main kubernetes reconciliation loop which aims to -// move the current state of the cluster closer to the desired state. -// TODO(user): Modify the Reconcile function to compare the state specified by -// the WorkloadProfile object against the actual cluster state, and then -// perform operations to make the cluster state reflect the state specified by -// the user. 
-// -// For more details, check Reconcile and its Result here: -// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/reconcile +// WorkloadProfile is a template referred to by TensorFusionWorkload; it requires no reconcile logic func (r *WorkloadProfileReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { _ = log.FromContext(ctx) - - // TODO(user): your logic here - return ctrl.Result{}, nil } diff --git a/internal/gpuallocator/gpuallocator.go b/internal/gpuallocator/gpuallocator.go index c9ddbd79..99be3248 100644 --- a/internal/gpuallocator/gpuallocator.go +++ b/internal/gpuallocator/gpuallocator.go @@ -4,14 +4,20 @@ package gpuallocator import ( "context" "fmt" + "strings" "sync" "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/NexusGPU/tensor-fusion/internal/gpuallocator/filter" + "github.com/samber/lo" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/cache" + "k8s.io/client-go/util/retry" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -51,6 +57,7 @@ type GpuAllocator struct { func (s *GpuAllocator) Alloc( ctx context.Context, poolName string, + workloadNameNamespace tfv1.NameNamespace, request tfv1.Resource, count uint, gpuModel string, @@ -101,7 +108,9 @@ func (s *GpuAllocator) Alloc( s.storeMutex.Lock() defer s.storeMutex.Unlock() + appAdded := false for _, selectedGPU := range selectedGPUs { + // Get the GPU from the store key := types.NamespacedName{Name: selectedGPU.Name, Namespace: selectedGPU.Namespace} gpu, exists := s.gpuStore[key] @@ -115,6 +124,11 @@ func (s *GpuAllocator) Alloc( gpu.Status.Available.Tflops.Sub(request.Tflops) gpu.Status.Available.Vram.Sub(request.Vram) + if !appAdded { + addRunningApp(ctx, gpu, workloadNameNamespace) + appAdded = true + } + s.markGPUDirty(key) } @@ -128,12 +142,13 @@ func (s *GpuAllocator) Alloc( return result, nil } -// Dealloc deallocates a request from one or multiple gpus. -func (s *GpuAllocator) Dealloc(ctx context.Context, request tfv1.Resource, gpus []types.NamespacedName) error { +// Dealloc releases a request from one or more GPUs, adding the resources back to their available capacity.
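// A minimal caller-side sketch of the new Alloc/Dealloc pairing (illustrative only: the
// pool name, workload identity, and request values below are made up, and error handling
// around Alloc is elided; deallocation should use the same workload identity and request
// as the allocation so the running-app reference counts stay balanced):
//
//	wl := tfv1.NameNamespace{Namespace: "default", Name: "example-workload"}
//	req := tfv1.Resource{Tflops: resource.MustParse("20"), Vram: resource.MustParse("8Gi")}
//	gpus, _ := allocator.Alloc(ctx, "example-pool", wl, req, 1, "")
//	keys := lo.Map(gpus, func(g *tfv1.GPU, _ int) types.NamespacedName {
//		return client.ObjectKeyFromObject(g)
//	})
//	// Later, when the worker pod goes away, release the same request for the same workload.
//	_ = allocator.Dealloc(ctx, wl, req, keys)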
+func (s *GpuAllocator) Dealloc(ctx context.Context, workloadNameNamespace tfv1.NameNamespace, request tfv1.Resource, gpus []types.NamespacedName) error { log := log.FromContext(ctx) s.storeMutex.Lock() defer s.storeMutex.Unlock() + appRemoved := false for _, gpu := range gpus { // Get the GPU from the store storeGPU, exists := s.gpuStore[gpu] @@ -145,6 +160,10 @@ func (s *GpuAllocator) Dealloc(ctx context.Context, request tfv1.Resource, gpus // Add resources back to the GPU storeGPU.Status.Available.Tflops.Add(request.Tflops) storeGPU.Status.Available.Vram.Add(request.Vram) + if !appRemoved { + removeRunningApp(ctx, storeGPU, workloadNameNamespace) + appRemoved = true + } s.markGPUDirty(gpu) } @@ -221,6 +240,10 @@ func (s *GpuAllocator) initGPUStore(ctx context.Context) error { } log.Info("GPU store initialized", "count", len(s.gpuStore)) + + // reconcile allocation state based on existing workers + s.reconcileAllocationState(ctx) + log.Info("GPU store data reconciled") return nil } @@ -378,6 +401,8 @@ func (s *GpuAllocator) syncToK8s(ctx context.Context) { s.storeMutex.RLock() defer s.storeMutex.RUnlock() + dirtyNodes := make(map[string]struct{}) + for _, key := range dirtyGPUs { gpu, exists := s.gpuStore[key] if !exists { @@ -386,6 +411,8 @@ func (s *GpuAllocator) syncToK8s(ctx context.Context) { // Create a copy to avoid modifying the memory store directly gpuCopy := gpu.DeepCopy() + dirtyNodes[gpuCopy.Labels[constants.LabelKeyOwner]] = struct{}{} + // Update the GPU status in Kubernetes if err := s.Status().Update(ctx, gpuCopy); err != nil { // If update fails, put the GPU back in the dirty queue @@ -395,6 +422,25 @@ func (s *GpuAllocator) syncToK8s(ctx context.Context) { log.Error(err, "Failed to update GPU status, will retry later", "gpu", key.String()) } } + + for nodeName := range dirtyNodes { + // Refer https://datatracker.ietf.org/doc/html/rfc6901#section-3 encode `/` as `~1` + patch := []byte(`[{ + "op": "add", + "path": "/metadata/annotations/` + strings.ReplaceAll(constants.GPULastReportTimeAnnotationKey, "/", "~1") + `", + "value": "` + time.Now().Format(time.RFC3339) + `" + }]`) + err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + return s.Patch(ctx, &tfv1.GPUNode{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + }, + }, client.RawPatch(types.JSONPatchType, patch)) + }) + if err != nil { + log.Error(err, "Failed to update GPU node last report time, will retry later", "node", nodeName) + } + } } // listGPUsFromPool gets GPUs from the specified pool using the in-memory store @@ -417,3 +463,109 @@ func (s *GpuAllocator) markGPUDirty(key types.NamespacedName) { defer s.dirtyQueueLock.Unlock() s.dirtyQueue[key] = struct{}{} } + +// When it's leader, should reconcile state based on existing workers +// this function is run inside storeMutex lock +func (s *GpuAllocator) reconcileAllocationState(ctx context.Context) { + logger := log.FromContext(ctx) + workers := &v1.PodList{} + if err := s.List(ctx, workers, client.MatchingLabels(map[string]string{ + constants.LabelComponent: constants.ComponentWorker, + })); err != nil { + logger.Error(err, "Failed to list Workloads to reconcile allocation state") + return + } + + tflopsCapacityMap := make(map[types.NamespacedName]resource.Quantity) + vramCapacityMap := make(map[types.NamespacedName]resource.Quantity) + gpuMap := make(map[types.NamespacedName]*tfv1.GPU) + + for gpuKey, gpu := range s.gpuStore { + if gpu.Status.Capacity != nil { + tflopsCapacityMap[gpuKey] = gpu.Status.Capacity.Tflops + vramCapacityMap[gpuKey] 
= gpu.Status.Capacity.Vram + gpu.Status.RunningApps = []*tfv1.RunningAppDetail{} + gpuMap[gpuKey] = gpu + } + } + + for _, worker := range workers.Items { + tflopsRequest, _ := resource.ParseQuantity(worker.Annotations[constants.TFLOPSRequestAnnotation]) + vramRequest, _ := resource.ParseQuantity(worker.Annotations[constants.VRAMRequestAnnotation]) + gpuIds := worker.Annotations[constants.GpuKey] + gpuIdsList := strings.Split(gpuIds, ",") + appAdded := false + for _, gpuId := range gpuIdsList { + gpuKey := types.NamespacedName{Name: gpuId} + gpuCapacity, ok := tflopsCapacityMap[gpuKey] + if ok { + gpuCapacity.Sub(tflopsRequest) + } + gpuCapacity, ok = vramCapacityMap[gpuKey] + if ok { + gpuCapacity.Sub(vramRequest) + } + if !appAdded { + addRunningApp(ctx, gpuMap[gpuKey], tfv1.NameNamespace{Namespace: worker.Namespace, Name: worker.Labels[constants.WorkloadKey]}) + appAdded = true + } + } + } + + for gpuKey, gpu := range s.gpuStore { + if gpu.Status.Capacity == nil { + log.FromContext(ctx).Info("[Warning] GPU capacity is nil, skip reconcile", "gpu", gpuKey.Name) + continue + } + sameTflops := gpu.Status.Available.Tflops.Equal(tflopsCapacityMap[gpuKey]) + sameVRAM := gpu.Status.Available.Vram.Equal(vramCapacityMap[gpuKey]) + if !sameTflops || !sameVRAM { + gpu.Status.Available.Tflops = tflopsCapacityMap[gpuKey] + gpu.Status.Available.Vram = vramCapacityMap[gpuKey] + s.markGPUDirty(gpuKey) + log.FromContext(ctx).Info("Correcting gpu available resources", "gpu", gpuKey.Name, "tflops", gpu.Status.Available.Tflops.String(), "vram", gpu.Status.Available.Vram.String()) + } + } +} + +func addRunningApp(ctx context.Context, gpu *tfv1.GPU, workloadNameNamespace tfv1.NameNamespace) { + if gpu == nil { + log.FromContext(ctx).Info("[Warning] GPU is nil, skip adding running app", "workload", workloadNameNamespace.Name, "namespace", workloadNameNamespace.Namespace) + return + } + if gpu.Status.RunningApps == nil { + gpu.Status.RunningApps = []*tfv1.RunningAppDetail{} + } + + item, found := lo.Find(gpu.Status.RunningApps, func(app *tfv1.RunningAppDetail) bool { + return app.Name == workloadNameNamespace.Name && app.Namespace == workloadNameNamespace.Namespace + }) + + if found { + item.Count++ + } else { + gpu.Status.RunningApps = append(gpu.Status.RunningApps, &tfv1.RunningAppDetail{ + Name: workloadNameNamespace.Name, + Namespace: workloadNameNamespace.Namespace, + Count: 1, + }) + } +} + +func removeRunningApp(ctx context.Context, gpu *tfv1.GPU, workloadNameNamespace tfv1.NameNamespace) { + item, found := lo.Find(gpu.Status.RunningApps, func(app *tfv1.RunningAppDetail) bool { + return app.Name == workloadNameNamespace.Name && app.Namespace == workloadNameNamespace.Namespace + }) + if found { + item.Count-- + if item.Count == 0 { + // scale down to zero, not running any more + gpu.Status.RunningApps = lo.Filter(gpu.Status.RunningApps, func(app *tfv1.RunningAppDetail, _ int) bool { + return app.Name != workloadNameNamespace.Name && app.Namespace != workloadNameNamespace.Namespace + }) + } + } else { + // should not happen, if deallocation twice, it should be a bug + log.FromContext(ctx).Info("[Warning] The app to remove not found, could be caused by deallocation twice bug", "gpu", gpu.Name, "namespace", gpu.Namespace, "workload", workloadNameNamespace.Name, "namespace", workloadNameNamespace.Namespace) + } +} diff --git a/internal/gpuallocator/gpuallocator_suite_test.go b/internal/gpuallocator/gpuallocator_suite_test.go index 6bf4f605..eb3a9418 100644 --- 
a/internal/gpuallocator/gpuallocator_suite_test.go +++ b/internal/gpuallocator/gpuallocator_suite_test.go @@ -265,6 +265,91 @@ var _ = BeforeSuite(func() { Expect(err).NotTo(HaveOccurred()) } + nodes := []tfv1.GPUNode{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node-1", + Labels: map[string]string{ + constants.LabelKeyOwner: "test-pool", + }, + }, + Spec: tfv1.GPUNodeSpec{ + ManageMode: tfv1.GPUNodeManageModeAutoSelect, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node-2", + Labels: map[string]string{ + constants.LabelKeyOwner: "test-pool", + }, + }, + Spec: tfv1.GPUNodeSpec{ + ManageMode: tfv1.GPUNodeManageModeAutoSelect, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node-3", + Labels: map[string]string{ + constants.LabelKeyOwner: "test-pool", + }, + }, + Spec: tfv1.GPUNodeSpec{ + ManageMode: tfv1.GPUNodeManageModeAutoSelect, + }, + }, + } + for _, node := range nodes { + err = k8sClient.Create(ctx, &node) + Expect(err).NotTo(HaveOccurred()) + } + + gpuNodeStatuses := []struct { + name string + status tfv1.GPUNodeStatus + }{ + { + name: "node-1", + status: tfv1.GPUNodeStatus{ + Phase: tfv1.TensorFusionGPUNodePhaseRunning, + TotalTFlops: resource.MustParse("200"), + TotalVRAM: resource.MustParse("48Gi"), + AvailableTFlops: resource.MustParse("180"), + AvailableVRAM: resource.MustParse("48Gi"), + }, + }, + { + name: "node-2", + status: tfv1.GPUNodeStatus{ + Phase: tfv1.TensorFusionGPUNodePhaseRunning, + TotalTFlops: resource.MustParse("120"), + TotalVRAM: resource.MustParse("24Gi"), + AvailableTFlops: resource.MustParse("120"), + AvailableVRAM: resource.MustParse("24Gi"), + }, + }, + { + name: "node-3", + status: tfv1.GPUNodeStatus{ + Phase: tfv1.TensorFusionGPUNodePhaseRunning, + TotalTFlops: resource.MustParse("150"), + TotalVRAM: resource.MustParse("48Gi"), + AvailableTFlops: resource.MustParse("150"), + AvailableVRAM: resource.MustParse("48Gi"), + }, + }, + } + + for _, gpuNodeStatus := range gpuNodeStatuses { + gpuNode := &tfv1.GPUNode{} + err = k8sClient.Get(ctx, types.NamespacedName{Name: gpuNodeStatus.name, Namespace: "default"}, gpuNode) + Expect(err).NotTo(HaveOccurred()) + gpuNode.Status = gpuNodeStatus.status + err = k8sClient.Status().Update(ctx, gpuNode) + Expect(err).NotTo(HaveOccurred()) + } + go func() { defer GinkgoRecover() err = mgr.Start(ctx) @@ -280,10 +365,18 @@ var _ = AfterSuite(func() { }) // Helper function to get a GPU from the API server -func getGPU(name string, namespace string) *tfv1.GPU { +func getGPU(name string) *tfv1.GPU { gpu := &tfv1.GPU{} - key := types.NamespacedName{Name: name, Namespace: namespace} + key := types.NamespacedName{Name: name} err := k8sClient.Get(ctx, key, gpu) ExpectWithOffset(1, err).NotTo(HaveOccurred()) return gpu } + +func getGPUNode(gpu *tfv1.GPU) *tfv1.GPUNode { + gpuNode := &tfv1.GPUNode{} + key := types.NamespacedName{Name: gpu.Labels[constants.LabelKeyOwner]} + err := k8sClient.Get(ctx, key, gpuNode) + ExpectWithOffset(1, err).NotTo(HaveOccurred()) + return gpuNode +} diff --git a/internal/gpuallocator/gpuallocator_test.go b/internal/gpuallocator/gpuallocator_test.go index 9e150762..32388c4e 100644 --- a/internal/gpuallocator/gpuallocator_test.go +++ b/internal/gpuallocator/gpuallocator_test.go @@ -30,11 +30,27 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) +var workloadNameNs = tfv1.NameNamespace{Namespace: "default", Name: "test-workload"} + var _ = Describe("GPU Allocator", func() { var allocator *GpuAllocator + allocateAndSync := func(poolName string, request tfv1.Resource, count uint, 
gpuModel string) ([]*tfv1.GPU, error) { + gpus, err := allocator.Alloc(ctx, poolName, workloadNameNs, request, count, gpuModel) + allocator.syncToK8s(ctx) + return gpus, err + } + + deallocateAndSync := func(gpus []*tfv1.GPU, request tfv1.Resource) { + err := allocator.Dealloc(ctx, workloadNameNs, request, lo.Map(gpus, func(gpu *tfv1.GPU, _ int) types.NamespacedName { + return client.ObjectKeyFromObject(gpu) + })) + Expect(err).NotTo(HaveOccurred()) + allocator.syncToK8s(ctx) + } + BeforeEach(func() { - allocator = NewGpuAllocator(ctx, k8sClient, 3*time.Second) + allocator = NewGpuAllocator(ctx, k8sClient, 150*time.Millisecond) readyCh, err := allocator.SetupWithManager(ctx, mgr) Expect(err).NotTo(HaveOccurred()) @@ -61,17 +77,30 @@ var _ = Describe("GPU Allocator", func() { Vram: resource.MustParse("8Gi"), } - gpus, err := allocator.Alloc(ctx, "test-pool", request, 1, "") + gpus, err := allocateAndSync("test-pool", request, 1, "") Expect(err).NotTo(HaveOccurred()) Expect(gpus).To(HaveLen(1)) - // Explicitly call syncToK8s to persist changes before verification - allocator.syncToK8s(ctx) + gpuNode := &tfv1.GPUNode{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: gpus[0].Labels[constants.LabelKeyOwner]}, gpuNode); err != nil { + Expect(err).NotTo(HaveOccurred()) + } + pool := &tfv1.GPUPool{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: "test-pool"}, pool); err != nil { + Expect(err).NotTo(HaveOccurred()) + } + _, _ = RefreshGPUNodeCapacity(ctx, k8sClient, gpuNode, pool) // Verify resources were reduced on the allocated GPU - gpu := getGPU(gpus[0].Name, gpus[0].Namespace) + gpu := getGPU(gpus[0].Name) Expect(gpu.Status.Available.Tflops.Cmp(gpu.Status.Capacity.Tflops)).To(Equal(-1)) Expect(gpu.Status.Available.Vram.Cmp(gpu.Status.Capacity.Vram)).To(Equal(-1)) + + node := getGPUNode(gpu) + diffTflops := node.Status.TotalTFlops.Value() - node.Status.AvailableTFlops.Value() + diffVRAM := node.Status.TotalVRAM.Value() - node.Status.AvailableVRAM.Value() + Expect(diffTflops).To(BeEquivalentTo(50)) + Expect(diffVRAM).To(BeEquivalentTo(8 * 1024 * 1024 * 1024)) }) It("should allocate multiple GPUs from the same node", func() { @@ -80,7 +109,7 @@ var _ = Describe("GPU Allocator", func() { Vram: resource.MustParse("4Gi"), } - gpus, err := allocator.Alloc(ctx, "test-pool", request, 2, "") + gpus, err := allocateAndSync("test-pool", request, 2, "") Expect(err).NotTo(HaveOccurred()) Expect(gpus).To(HaveLen(2)) @@ -97,7 +126,7 @@ var _ = Describe("GPU Allocator", func() { Vram: resource.MustParse("2Gi"), } - _, err := allocator.Alloc(ctx, "test-pool", request, 10, "") + _, err := allocateAndSync("test-pool", request, 10, "") Expect(err).To(HaveOccurred()) }) @@ -107,7 +136,7 @@ var _ = Describe("GPU Allocator", func() { Vram: resource.MustParse("64Gi"), } - _, err := allocator.Alloc(ctx, "test-pool", request, 1, "") + _, err := allocateAndSync("test-pool", request, 1, "") Expect(err).To(HaveOccurred()) }) @@ -117,7 +146,7 @@ var _ = Describe("GPU Allocator", func() { Vram: resource.MustParse("2Gi"), } - _, err := allocator.Alloc(ctx, "nonexistent-pool", request, 1, "") + _, err := allocateAndSync("nonexistent-pool", request, 1, "") Expect(err).To(HaveOccurred()) }) @@ -128,13 +157,12 @@ var _ = Describe("GPU Allocator", func() { } // Try allocating with a specific GPU model - gpus, err := allocator.Alloc(ctx, "test-pool", request, 1, "NVIDIA A100") + gpus, err := allocateAndSync("test-pool", request, 1, "NVIDIA A100") Expect(err).NotTo(HaveOccurred()) - Expect(gpus).To(HaveLen(1)) 
Expect(gpus[0].Status.GPUModel).To(Equal("NVIDIA A100")) // Try allocating with a non-existent GPU model - _, err = allocator.Alloc(ctx, "test-pool", request, 1, "NonExistentModel") + _, err = allocateAndSync("test-pool", request, 1, "NonExistentModel") Expect(err).To(HaveOccurred()) }) }) @@ -147,7 +175,7 @@ var _ = Describe("GPU Allocator", func() { Vram: resource.MustParse("6Gi"), } - gpus, err := allocator.Alloc(ctx, "test-pool", request, 1, "") + gpus, err := allocateAndSync("test-pool", request, 1, "") Expect(err).NotTo(HaveOccurred()) Expect(gpus).To(HaveLen(1)) @@ -157,11 +185,10 @@ var _ = Describe("GPU Allocator", func() { allocatedVram := allocatedGPU.Status.Available.Vram.DeepCopy() // Now deallocate - err = allocator.Dealloc(ctx, request, []types.NamespacedName{client.ObjectKeyFromObject(gpus[0])}) - Expect(err).NotTo(HaveOccurred()) + deallocateAndSync(gpus, request) // Verify resources were restored - deallocatedGPU := getGPU(allocatedGPU.Name, allocatedGPU.Namespace) + deallocatedGPU := getGPU(allocatedGPU.Name) expectedTflops := allocatedTflops.DeepCopy() expectedVram := allocatedVram.DeepCopy() expectedTflops.Add(request.Tflops) @@ -180,7 +207,7 @@ var _ = Describe("GPU Allocator", func() { } // Allocate 2 GPUs - allocatedGPUs, err := allocator.Alloc(ctx, "test-pool", request, 2, "") + allocatedGPUs, err := allocateAndSync("test-pool", request, 2, "") Expect(err).NotTo(HaveOccurred()) Expect(allocatedGPUs).To(HaveLen(2)) @@ -209,23 +236,14 @@ var _ = Describe("GPU Allocator", func() { vram: gpu.Status.Available.Vram.DeepCopy(), } } - gpusToDeallocKeys := lo.Map(gpusToDealloc, func(gpu *tfv1.GPU, _ int) types.NamespacedName { - return client.ObjectKeyFromObject(gpu) - }) + // Now deallocate all GPUs including the non-existent one - err = allocator.Dealloc(ctx, request, gpusToDeallocKeys) - Expect(err).NotTo(HaveOccurred()) + deallocateAndSync(gpusToDealloc, request) // Verify resources were restored for existing GPUs for _, allocatedGPU := range allocatedGPUs { - deallocatedGPU := getGPU(allocatedGPU.Name, allocatedGPU.Namespace) + deallocatedGPU := getGPU(allocatedGPU.Name) initialState := initialStates[allocatedGPU.Name] - - expectedTflops := initialState.tflops.DeepCopy() - expectedVram := initialState.vram.DeepCopy() - expectedTflops.Add(request.Tflops) - expectedVram.Add(request.Vram) - Expect(deallocatedGPU.Status.Available.Tflops.Cmp(initialState.tflops)).To(Equal(1)) Expect(deallocatedGPU.Status.Available.Vram.Cmp(initialState.vram)).To(Equal(1)) } @@ -280,7 +298,7 @@ var _ = Describe("GPU Allocator", func() { Expect(exists).To(BeTrue()) // Get the GPU from the API server - gpuToDelete := getGPU("gpu-1", "") + gpuToDelete := getGPU("gpu-1") // Handle the deletion event allocator.handleGPUDelete(ctx, gpuToDelete) diff --git a/internal/gpuallocator/node_capacity.go b/internal/gpuallocator/node_capacity.go new file mode 100644 index 00000000..302d1d04 --- /dev/null +++ b/internal/gpuallocator/node_capacity.go @@ -0,0 +1,85 @@ +package gpuallocator + +import ( + "context" + "fmt" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" + "k8s.io/apimachinery/pkg/api/equality" + "k8s.io/apimachinery/pkg/api/resource" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +func RefreshGPUNodeCapacity(ctx context.Context, k8sClient client.Client, node *tfv1.GPUNode, pool *tfv1.GPUPool) ([]string, error) { + gpuList := &tfv1.GPUList{} + if err := k8sClient.List(ctx, gpuList, client.MatchingLabels{constants.LabelKeyOwner: 
node.Name}); err != nil { + return nil, fmt.Errorf("failed to list GPUs: %w", err) + } + if len(gpuList.Items) == 0 { + // node discovery job not completed, wait next reconcile loop to check again + return nil, nil + } + + statusCopy := node.Status.DeepCopy() + + node.Status.AvailableVRAM = resource.Quantity{} + node.Status.AvailableTFlops = resource.Quantity{} + node.Status.TotalTFlops = resource.Quantity{} + node.Status.TotalVRAM = resource.Quantity{} + node.Status.AllocationInfo = []*tfv1.RunningAppDetail{} + + gpuModels := []string{} + deduplicationMap := make(map[string]struct{}) + + for _, gpu := range gpuList.Items { + node.Status.AvailableVRAM.Add(gpu.Status.Available.Vram) + node.Status.AvailableTFlops.Add(gpu.Status.Available.Tflops) + node.Status.TotalVRAM.Add(gpu.Status.Capacity.Vram) + node.Status.TotalTFlops.Add(gpu.Status.Capacity.Tflops) + gpuModels = append(gpuModels, gpu.Status.GPUModel) + + for _, runningApp := range gpu.Status.RunningApps { + if _, ok := deduplicationMap[runningApp.Name+"_"+runningApp.Namespace]; !ok { + node.Status.AllocationInfo = append(node.Status.AllocationInfo, runningApp.DeepCopy()) + deduplicationMap[runningApp.Name+"_"+runningApp.Namespace] = struct{}{} + } + } + } + + virtualVRAM, virtualTFlops := calculateVirtualCapacity(node, pool) + node.Status.VirtualTFlops = virtualTFlops + node.Status.VirtualVRAM = virtualVRAM + + node.Status.Phase = tfv1.TensorFusionGPUNodePhaseRunning + + if !equality.Semantic.DeepEqual(node.Status, statusCopy) { + err := k8sClient.Status().Update(ctx, node) + if err != nil { + return nil, fmt.Errorf("failed to update GPU node status: %w", err) + } + } + return gpuModels, nil +} + +func calculateVirtualCapacity(node *tfv1.GPUNode, pool *tfv1.GPUPool) (resource.Quantity, resource.Quantity) { + diskSize, _ := node.Status.NodeInfo.DataDiskSize.AsInt64() + ramSize, _ := node.Status.NodeInfo.RAMSize.AsInt64() + + virtualVRAM := node.Status.TotalVRAM.DeepCopy() + if pool.Spec.CapacityConfig == nil || pool.Spec.CapacityConfig.Oversubscription == nil { + return virtualVRAM, node.Status.TotalTFlops.DeepCopy() + } + vTFlops := node.Status.TotalTFlops.AsApproximateFloat64() * (float64(pool.Spec.CapacityConfig.Oversubscription.TFlopsOversellRatio) / 100.0) + + virtualVRAM.Add(*resource.NewQuantity( + int64(float64(float64(diskSize)*float64(pool.Spec.CapacityConfig.Oversubscription.VRAMExpandToHostDisk)/100.0)), + resource.DecimalSI), + ) + virtualVRAM.Add(*resource.NewQuantity( + int64(float64(float64(ramSize)*float64(pool.Spec.CapacityConfig.Oversubscription.VRAMExpandToHostMem)/100.0)), + resource.DecimalSI), + ) + + return virtualVRAM, *resource.NewQuantity(int64(vTFlops), resource.DecimalSI) +} diff --git a/internal/metrics/recorder.go b/internal/metrics/recorder.go new file mode 100644 index 00000000..4f1629dc --- /dev/null +++ b/internal/metrics/recorder.go @@ -0,0 +1,286 @@ +package metrics + +import ( + "io" + "sync" + "time" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" + metricsProto "github.com/influxdata/line-protocol/v2/lineprotocol" + "gopkg.in/natefinch/lumberjack.v2" + corev1 "k8s.io/api/core/v1" + ctrl "sigs.k8s.io/controller-runtime" +) + +// Worker level metrics, include worker resources/costs status +// map updated in one reconcile loop in single goroutine, thus no RW lock needed +var workerMetricsLock sync.RWMutex +var workerMetricsMap = map[string]*WorkerMetrics{} + +// Node level metrics, include node allocation/costs status +var nodeMetricsLock 
sync.RWMutex +var NodeMetricsMap = map[string]*NodeMetrics{} + +var log = ctrl.Log.WithName("metrics-recorder") + +type MetricsRecorder struct { + MetricsOutputPath string + + // Raw billing result for node and workers + HourlyUnitPriceMap map[string]float64 + + // Worker level unit price map, key is pool name, second level key is QoS level + WorkerUnitPriceMap map[string]map[string]RawBillingPricing +} + +func RemoveWorkerMetrics(workerName string, deletionTime time.Time) { + workerMetricsLock.Lock() + // to get more accurate metrics, should record the deletion timestamp to calculate duration for the last metrics + workerMetricsMap[workerName].DeletionTimestamp = deletionTime + workerMetricsLock.Unlock() +} + +func RemoveNodeMetrics(nodeName string) { + nodeMetricsLock.Lock() + // Node lifecycle is much longer than worker, so just delete the metrics, 1 minute metrics interval is enough + delete(NodeMetricsMap, nodeName) + nodeMetricsLock.Unlock() +} + +func SetWorkerMetricsByWorkload(pod *corev1.Pod, workload *tfv1.TensorFusionWorkload, now time.Time) { + workerMetricsLock.Lock() + defer workerMetricsLock.Unlock() + + // Initialize metrics + if _, ok := workerMetricsMap[pod.Name]; !ok { + workerMetricsMap[pod.Name] = &WorkerMetrics{ + WorkerName: pod.Name, + WorkloadName: workload.Name, + PoolName: workload.Spec.PoolName, + Namespace: pod.Namespace, + QoS: string(workload.Spec.Qos), + RawCost: 0, + LastRecordTime: now, + } + } + + // Update metrics fields that are mutable + metricsItem := workerMetricsMap[pod.Name] + metricsItem.TflopsRequest = workload.Spec.Resources.Requests.Tflops.AsApproximateFloat64() + metricsItem.TflopsLimit = workload.Spec.Resources.Limits.Tflops.AsApproximateFloat64() + metricsItem.VramBytesRequest = workload.Spec.Resources.Requests.Vram.AsApproximateFloat64() + metricsItem.VramBytesLimit = workload.Spec.Resources.Limits.Vram.AsApproximateFloat64() + if workload.Spec.GPUCount <= 0 { + // handle invalid data if exists + metricsItem.GPUCount = 1 + } else { + metricsItem.GPUCount = int(workload.Spec.GPUCount) + } + metricsItem.WorkloadName = workload.Name + +} + +func SetNodeMetrics(node *tfv1.GPUNode, poolObj *tfv1.GPUPool, gpuModels []string) { + nodeMetricsLock.Lock() + defer nodeMetricsLock.Unlock() + + if _, ok := NodeMetricsMap[node.Name]; !ok { + NodeMetricsMap[node.Name] = &NodeMetrics{ + NodeName: node.Name, + RawCost: 0, + LastRecordTime: time.Now(), + } + } + // Fields that possibly change after initialization + metricsItem := NodeMetricsMap[node.Name] + metricsItem.PoolName = poolObj.Name + metricsItem.GPUModels = gpuModels + + totalTflops := node.Status.TotalTFlops.AsApproximateFloat64() + totalVram := node.Status.TotalVRAM.AsApproximateFloat64() + + metricsItem.AllocatedTflops = totalTflops - node.Status.AvailableTFlops.AsApproximateFloat64() + if totalTflops <= 0 { + metricsItem.AllocatedTflopsPercent = 0 + } else { + metricsItem.AllocatedTflopsPercent = metricsItem.AllocatedTflops / totalTflops * 100 + } + + metricsItem.AllocatedVramBytes = totalVram - node.Status.AvailableVRAM.AsApproximateFloat64() + if totalVram <= 0 { + metricsItem.AllocatedVramPercent = 0 + } else { + metricsItem.AllocatedVramPercent = metricsItem.AllocatedVramBytes / totalVram * 100 + } + + totalVirtualTflops := node.Status.VirtualTFlops.AsApproximateFloat64() + totalVirtualVram := node.Status.VirtualVRAM.AsApproximateFloat64() + if totalVirtualTflops <= 0 { + metricsItem.AllocatedTflopsPercentToVirtualCap = 0 + } else { + metricsItem.AllocatedTflopsPercentToVirtualCap = 
metricsItem.AllocatedTflops / totalVirtualTflops * 100 + } + if totalVirtualVram <= 0 { + metricsItem.AllocatedVramPercentToVirtualCap = 0 + } else { + metricsItem.AllocatedVramPercentToVirtualCap = metricsItem.AllocatedVramBytes / totalVirtualVram * 100 + } +} + +// Start metrics recorder +// The leader container will fill the metrics map, so followers don't have metrics point +// thus metrics recorder only printed in one controller instance +// One minute interval could cause some metrics ignored or billing not accurate, known issue +func (mr *MetricsRecorder) Start() { + + ticker := time.NewTicker(time.Minute) + + writer := &lumberjack.Logger{ + Filename: mr.MetricsOutputPath, + MaxSize: 100, + MaxBackups: 10, + MaxAge: 28, + } + + // Record metrics + go func() { + for { + <-ticker.C + mr.RecordMetrics(writer) + } + }() + + // Clean up worker metrics that have been deleted + go func() { + for { + time.Sleep(5 * time.Minute) + workerMetricsLock.Lock() + for _, metrics := range workerMetricsMap { + if !metrics.DeletionTimestamp.IsZero() { + delete(workerMetricsMap, metrics.WorkerName) + } + } + workerMetricsLock.Unlock() + } + }() +} + +func (mr *MetricsRecorder) RecordMetrics(writer io.Writer) { + if len(workerMetricsMap) <= 0 && len(NodeMetricsMap) <= 0 { + return + } + + now := time.Now() + + var enc metricsProto.Encoder + enc.SetPrecision(metricsProto.Millisecond) + + workerMetricsLock.RLock() + + activeWorkerCnt := 0 + for _, metrics := range workerMetricsMap { + + if !metrics.DeletionTimestamp.IsZero() { + metrics.RawCost = mr.getWorkerRawCost(metrics, metrics.DeletionTimestamp.Sub(metrics.LastRecordTime)) + } else { + metrics.RawCost = mr.getWorkerRawCost(metrics, now.Sub(metrics.LastRecordTime)) + } + metrics.LastRecordTime = now + + // Skip recording metrics if raw cost is negative + // which means worker already deleted waiting for cleanup + if metrics.RawCost < 0 { + continue + } + activeWorkerCnt++ + enc.StartLine("tf_worker_metrics") + enc.AddTag("namespace", metrics.Namespace) + enc.AddTag("pool_name", metrics.PoolName) + enc.AddTag("qos", metrics.QoS) + enc.AddTag("worker_name", metrics.WorkerName) + enc.AddTag("workload_name", metrics.WorkloadName) + + enc.AddField("gpu_count", metricsProto.MustNewValue(int64(metrics.GPUCount))) + enc.AddField("tflops_limit", metricsProto.MustNewValue(metrics.TflopsLimit)) + enc.AddField("tflops_request", metricsProto.MustNewValue(metrics.TflopsRequest)) + enc.AddField("raw_cost", metricsProto.MustNewValue(metrics.RawCost)) + enc.AddField("vram_bytes_limit", metricsProto.MustNewValue(metrics.VramBytesLimit)) + enc.AddField("vram_bytes_request", metricsProto.MustNewValue(metrics.VramBytesRequest)) + + enc.EndLine(now) + } + enc.StartLine("tf_system_metrics") + enc.AddField("total_workers_cnt", metricsProto.MustNewValue(int64(activeWorkerCnt))) + workerMetricsLock.RUnlock() + + nodeMetricsLock.RLock() + for _, metrics := range NodeMetricsMap { + metrics.RawCost = mr.getNodeRawCost(metrics, now.Sub(metrics.LastRecordTime), mr.HourlyUnitPriceMap) + metrics.LastRecordTime = now + + enc.StartLine("tf_node_metrics") + + enc.AddTag("node_name", metrics.NodeName) + enc.AddTag("pool_name", metrics.PoolName) + + enc.AddField("allocated_tflops", metricsProto.MustNewValue(metrics.AllocatedTflops)) + enc.AddField("allocated_tflops_percent", metricsProto.MustNewValue(metrics.AllocatedTflopsPercent)) + enc.AddField("allocated_vram_bytes", metricsProto.MustNewValue(metrics.AllocatedVramBytes)) + enc.AddField("allocated_vram_percent", 
metricsProto.MustNewValue(metrics.AllocatedVramPercent)) + enc.AddField("gpu_count", metricsProto.MustNewValue(int64(len(metrics.GPUModels)))) + enc.AddField("raw_cost", metricsProto.MustNewValue(metrics.RawCost)) + enc.EndLine(now) + } + enc.StartLine("tf_system_metrics") + enc.AddField("total_nodes_cnt", metricsProto.MustNewValue(int64(len(NodeMetricsMap)))) + enc.EndLine(now) + + nodeMetricsLock.RUnlock() + + if err := enc.Err(); err != nil { + log.Error(err, "metrics encoding error", "workerCount", activeWorkerCnt, "nodeCount", len(NodeMetricsMap)) + } + + if _, err := writer.Write(enc.Bytes()); err != nil { + log.Error(err, "metrics writing error", "workerCount", activeWorkerCnt, "nodeCount", len(NodeMetricsMap)) + } + log.Info("metrics and raw billing recorded:", "workerCount", activeWorkerCnt, "nodeCount", len(NodeMetricsMap)) +} + +func (mr *MetricsRecorder) getWorkerRawCost(metrics *WorkerMetrics, duration time.Duration) float64 { + qosPricing, ok := mr.WorkerUnitPriceMap[metrics.PoolName] + // The qos pricing for this pool not set + if !ok { + return 0 + } + // The price of current qos not defined for this pool + qosLevel := metrics.QoS + if qosLevel == "" { + qosLevel = constants.QoSLevelMedium + } + pricing, ok := qosPricing[qosLevel] + if !ok { + return 0 + } + if duration < 0 { + return -1 + } + + rawCostTflopsLimitOverRequest := (metrics.TflopsLimit - metrics.TflopsRequest) * pricing.TflopsOverRequestPerSecond + rawCostPerTflops := pricing.TflopsPerSecond * metrics.TflopsRequest + + rawCostVRAMLimitOverRequest := (metrics.VramBytesLimit - metrics.VramBytesRequest) * pricing.VramOverRequestPerSecond / constants.GiBToBytes + rawCostPerVRAM := pricing.VramPerSecond * metrics.VramBytesRequest / constants.GiBToBytes + + return (rawCostPerTflops + rawCostPerVRAM + rawCostTflopsLimitOverRequest + rawCostVRAMLimitOverRequest) * duration.Seconds() * float64(metrics.GPUCount) +} + +// unit price data comes from global config map, and multi-GPU instance should normalized with per GPU pricing, e.g. 
8xA100 p4d.24xlarge price should divide by 8 +func (mr *MetricsRecorder) getNodeRawCost(metrics *NodeMetrics, duration time.Duration, hourlyUnitPriceMap map[string]float64) float64 { + cost := 0.0 + for _, gpuModel := range metrics.GPUModels { + cost += metrics.AllocatedTflops * duration.Hours() * hourlyUnitPriceMap[gpuModel] + } + return cost +} diff --git a/internal/metrics/type.go b/internal/metrics/type.go new file mode 100644 index 00000000..fdde703a --- /dev/null +++ b/internal/metrics/type.go @@ -0,0 +1,53 @@ +package metrics + +import "time" + +// Metrics will be stored in a map, key is the worker name, value is the metrics +// By default, metrics will be updated every minute +type WorkerMetrics struct { + WorkerName string `json:"workerName"` + WorkloadName string `json:"workloadName"` + PoolName string `json:"poolName"` + Namespace string `json:"namespace"` + QoS string `json:"qos"` + + TflopsRequest float64 `json:"tflopsRequest"` + TflopsLimit float64 `json:"tflopsLimit"` + VramBytesRequest float64 `json:"vramBytesRequest"` + VramBytesLimit float64 `json:"vramBytesLimit"` + GPUCount int `json:"gpuCount"` + RawCost float64 `json:"rawCost"` + + LastRecordTime time.Time `json:"lastRecordTime"` + + // For more accurate metrics, should record the deletion timestamp to calculate duration for the last metrics + DeletionTimestamp time.Time `json:"deletionTimestamp"` +} + +type NodeMetrics struct { + NodeName string `json:"nodeName"` + PoolName string `json:"poolName"` + + AllocatedTflops float64 `json:"allocatedTflops"` + AllocatedTflopsPercent float64 `json:"allocatedTflopsPercent"` + AllocatedVramBytes float64 `json:"allocatedVramBytes"` + AllocatedVramPercent float64 `json:"allocatedVramPercent"` + + AllocatedTflopsPercentToVirtualCap float64 `json:"allocatedTflopsPercentToVirtualCap"` + AllocatedVramPercentToVirtualCap float64 `json:"allocatedVramPercentToVirtualCap"` + + RawCost float64 `json:"rawCost"` + + LastRecordTime time.Time `json:"lastRecordTime"` + + // additional field for raw cost calculation since each GPU has different price + GPUModels []string `json:"gpuModels"` +} + +type RawBillingPricing struct { + TflopsPerSecond float64 + VramPerSecond float64 + + TflopsOverRequestPerSecond float64 + VramOverRequestPerSecond float64 +} diff --git a/internal/metrics/worker.go b/internal/metrics/worker.go deleted file mode 100644 index 3e5bf843..00000000 --- a/internal/metrics/worker.go +++ /dev/null @@ -1,78 +0,0 @@ -package metrics - -import ( - "github.com/prometheus/client_golang/prometheus" - "sigs.k8s.io/controller-runtime/pkg/metrics" -) - -var ( - labels = []string{ - "namespace", "worker", "pool", - } - - nodeLabels = []string{ - "nodeName", "pool", - } - - GpuTflopsRequest = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "gpu_tflops_request", - }, - labels, - ) - - GpuTflopsLimit = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "gpu_tflops_limit", - }, - labels, - ) - - VramBytesRequest = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "vram_bytes_request", - }, - labels, - ) - - VramBytesLimit = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "vram_bytes_limit", - }, - labels, - ) - - GpuCount = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "gpu_count", - Help: "Number of GPUs allocated to the workload", - }, - labels, - ) - - AllocatedTflopsPercent = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "allocated_compute_percentage", - }, - nodeLabels, - ) - - AllocatedVramBytes = prometheus.NewGaugeVec( - 
prometheus.GaugeOpts{ - Name: "allocated_memory_bytes", - }, - nodeLabels, - ) -) - -func init() { - metrics.Registry.MustRegister( - GpuTflopsRequest, - GpuTflopsLimit, - VramBytesRequest, - VramBytesLimit, - AllocatedTflopsPercent, - AllocatedVramBytes, - GpuCount, - ) -} diff --git a/internal/portallocator/portallocator.go b/internal/portallocator/portallocator.go new file mode 100644 index 00000000..72707c22 --- /dev/null +++ b/internal/portallocator/portallocator.go @@ -0,0 +1,268 @@ +package portallocator + +import ( + "context" + "fmt" + "math/bits" + "strconv" + "strings" + "sync" + + "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/utils" + v1 "k8s.io/api/core/v1" + "k8s.io/client-go/util/retry" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" +) + +// Offer API for host port allocation, range from user configured port range +// Use label: `tensor-fusion.ai/host-port: auto` to assigned port at cluster level +// vGPU worker's hostPort will be managed by operator +type PortAllocator struct { + PortRangeStartNode int + PortRangeEndNode int + + PortRangeStartCluster int + PortRangeEndCluster int + + IsLeader bool + + BitmapPerNode map[string][]uint64 + BitmapCluster []uint64 + + Client client.Client + + storeMutexNode sync.RWMutex + storeMutexCluster sync.RWMutex +} + +func NewPortAllocator(ctx context.Context, client client.Client, nodeLevelPortRange string, clusterLevelPortRange string) (*PortAllocator, error) { + if client == nil { + return nil, fmt.Errorf("client cannot be nil") + } + + nodeLevelRange := strings.Split(nodeLevelPortRange, "-") + clusterLevelRange := strings.Split(clusterLevelPortRange, "-") + + portRangeStartNode, _ := strconv.Atoi(nodeLevelRange[0]) + portRangeEndNode, _ := strconv.Atoi(nodeLevelRange[1]) + + portRangeStartCluster, _ := strconv.Atoi(clusterLevelRange[0]) + portRangeEndCluster, _ := strconv.Atoi(clusterLevelRange[1]) + + allocator := &PortAllocator{ + PortRangeStartNode: portRangeStartNode, + PortRangeEndNode: portRangeEndNode, + PortRangeStartCluster: portRangeStartCluster, + PortRangeEndCluster: portRangeEndCluster, + Client: client, + IsLeader: false, + BitmapPerNode: make(map[string][]uint64), + BitmapCluster: make([]uint64, (portRangeEndCluster-portRangeStartCluster)/64+1), + + storeMutexNode: sync.RWMutex{}, + storeMutexCluster: sync.RWMutex{}, + } + + return allocator, nil +} + +func (s *PortAllocator) SetupWithManager(ctx context.Context, mgr manager.Manager) <-chan struct{} { + readyCh := make(chan struct{}, 1) + _ = mgr.Add(manager.RunnableFunc(func(ctx context.Context) error { + <-mgr.Elected() + s.IsLeader = true + leaderInfo := &v1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: constants.LeaderInfoConfigMapName, + Namespace: utils.CurrentNamespace(), + }, + } + err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + _, err := controllerutil.CreateOrUpdate(ctx, s.Client, leaderInfo, func() error { + leaderInfo.Data = map[string]string{ + constants.LeaderInfoConfigMapLeaderIPKey: utils.CurrentIP(), + } + return nil + }) + return err + }) + if err != nil { + log.FromContext(ctx).Error(err, "Failed to update leader IP info in ConfigMap") + } + + s.storeMutexNode.Lock() + s.storeMutexCluster.Lock() + defer s.storeMutexNode.Unlock() + defer s.storeMutexCluster.Unlock() + + // 
1. init bit map from existing pods labeled with tensor-fusion.ai/host-port=auto + s.initBitMapForClusterLevelPortAssign(ctx) + + // 2. init bit map for existing vGPU workers + s.initBitMapForNodeLevelPortAssign(ctx) + + readyCh <- struct{}{} + return nil + })) + return readyCh +} + +func (s *PortAllocator) GetLeaderIP() string { + leaderInfo := &v1.ConfigMap{} + err := s.Client.Get(context.Background(), client.ObjectKey{ + Name: constants.LeaderInfoConfigMapName, + Namespace: utils.CurrentNamespace(), + }, leaderInfo) + if err != nil { + log.FromContext(context.Background()).Error(err, "Failed to get leader IP info from ConfigMap") + return "" + } + if leaderInfo.Data == nil { + return "" + } + return leaderInfo.Data[constants.LeaderInfoConfigMapLeaderIPKey] +} + +// AssignHostPort always called by operator itself, thus no Leader-Follower inconsistency issue +func (s *PortAllocator) AssignHostPort(nodeName string) (int, error) { + if nodeName == "" { + return 0, fmt.Errorf("node name cannot be empty when assign host port") + } + s.storeMutexNode.Lock() + defer s.storeMutexNode.Unlock() + + bitmap, ok := s.BitmapPerNode[nodeName] + if !ok { + // found new nodes not have any ports assigned before + bitmapSize := (s.PortRangeEndNode - s.PortRangeStartNode + 63) / 64 + s.BitmapPerNode[nodeName] = make([]uint64, bitmapSize) + bitmap = s.BitmapPerNode[nodeName] + } + for i, subMap := range bitmap { + bitPos := bits.TrailingZeros64(^subMap) + portOffset := i*64 + bitPos + if subMap != 0xFFFFFFFFFFFFFFFF { + assignedPort := portOffset + s.PortRangeStartNode + if assignedPort < s.PortRangeEndNode { + bitmap[i] = subMap | (1 << bitPos) + return assignedPort, nil + } else { + break + } + } + } + return 0, fmt.Errorf("no available port on node %s", nodeName) + +} + +func (s *PortAllocator) ReleaseHostPort(nodeName string, port int) error { + if port == 0 { + return fmt.Errorf("port cannot be 0 when release host port, may caused by portNumber annotation not detected, nodeName: %s", nodeName) + } + s.storeMutexNode.Lock() + defer s.storeMutexNode.Unlock() + + if bitmap, ok := s.BitmapPerNode[nodeName]; !ok { + return fmt.Errorf("node %s not found in bitmap", nodeName) + } else { + portOffset := port - s.PortRangeStartNode + bitmap[portOffset/64] &^= 1 << (portOffset % 64) + } + return nil +} + +func (s *PortAllocator) AssignClusterLevelHostPort(podName string) (int, error) { + + s.storeMutexCluster.Lock() + defer s.storeMutexCluster.Unlock() + + for i, subMap := range s.BitmapCluster { + bitPos := bits.TrailingZeros64(^subMap) + portOffset := i*64 + bitPos + if subMap != 0xFFFFFFFFFFFFFFFF { + assignedPort := portOffset + s.PortRangeStartCluster + if assignedPort < s.PortRangeEndCluster { + s.BitmapCluster[i] |= 1 << bitPos + return assignedPort, nil + } + } + } + return 0, fmt.Errorf("no available port on cluster") +} + +func (s *PortAllocator) ReleaseClusterLevelHostPort(podName string, port int) error { + if port == 0 { + return fmt.Errorf("port cannot be 0 when release host port, may caused by portNumber annotation not detected, podName: %s", podName) + } + + // TODO, may need a defer queue for releasing so that to avoid port being assigned again too fast + + s.storeMutexCluster.Lock() + defer s.storeMutexCluster.Unlock() + + portOffset := port - s.PortRangeStartCluster + s.BitmapCluster[portOffset/64] &^= 1 << (portOffset % 64) + return nil +} + +func (s *PortAllocator) initBitMapForClusterLevelPortAssign(ctx context.Context) { + log := log.FromContext(ctx) + podList := &v1.PodList{} + err := 
s.Client.List(ctx, podList, client.MatchingLabels{constants.GenHostPortLabel: constants.GenHostPortLabelValue}) + if err != nil { + log.Error(err, "failed to list pods with port allocation label") + return + } + usedPorts := []uint16{} + for _, pod := range podList.Items { + if pod.Annotations == nil { + continue + } + port, _ := strconv.Atoi(pod.Annotations[constants.GenPortNumberAnnotation]) + if port > s.PortRangeEndCluster || port < s.PortRangeStartCluster { + log.Error(err, "existing Pod's host port out of range", "port", port, "expected-start", s.PortRangeStartCluster, "expected-end", s.PortRangeEndCluster, "pod", pod.Name) + continue + } + bitOffSet := port - s.PortRangeStartCluster + + usedPorts = append(usedPorts, uint16(bitOffSet)) + } + + for _, port := range usedPorts { + s.BitmapCluster[port/64] |= 1 << (port % 64) + } +} + +func (s *PortAllocator) initBitMapForNodeLevelPortAssign(ctx context.Context) { + log := log.FromContext(ctx) + podList := &v1.PodList{} + err := s.Client.List(ctx, podList, client.MatchingLabels{constants.LabelComponent: constants.ComponentWorker}) + if err != nil { + log.Error(err, "failed to list pods with port allocation label") + return + } + + size := (s.PortRangeEndNode-s.PortRangeStartNode)/64 + 1 + for _, pod := range podList.Items { + if pod.Annotations == nil { + continue + } + port, _ := strconv.Atoi(pod.Annotations[constants.GenPortNumberAnnotation]) + if port > s.PortRangeEndNode || port < s.PortRangeStartNode { + log.Error(err, "existing Pod's node level host port out of range", "port", port, "expected-start", s.PortRangeStartNode, "expected-end", s.PortRangeEndNode, "pod", pod.Name, "node", pod.Spec.NodeName) + continue + } + bitOffSet := port - s.PortRangeStartNode + if _, ok := s.BitmapPerNode[pod.Spec.NodeName]; !ok { + s.BitmapPerNode[pod.Spec.NodeName] = make([]uint64, size) + } + s.BitmapPerNode[pod.Spec.NodeName][bitOffSet/64] |= 1 << (bitOffSet % 64) + } + +} diff --git a/internal/portallocator/portallocator_suite_test.go b/internal/portallocator/portallocator_suite_test.go new file mode 100644 index 00000000..ec9b4566 --- /dev/null +++ b/internal/portallocator/portallocator_suite_test.go @@ -0,0 +1,158 @@ +package portallocator + +import ( + "context" + "fmt" + "path/filepath" + "runtime" + "testing" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/envtest" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" +) + +// These tests use Ginkgo (BDD-style Go testing framework). Refer to +// http://onsi.github.io/ginkgo/ to learn more about Ginkgo. 
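// A rough worked example of the bitmap bookkeeping exercised by this suite (the numbers
// assume the node-level range "40000-42000" configured below; the arithmetic mirrors
// AssignHostPort/ReleaseHostPort and is not additional production logic):
//
//	offset := 40127 - 40000            // worker-3 below holds node port 40127 -> offset 127
//	word, bit := offset/64, offset%64  // word 1, bit 63
//	bitmap[word] |= 1 << bit           // mark the port as used at init/assign time
//	bitmap[word] &^= 1 << bit          // clear it again on release
//
// Free ports are located per word with bits.TrailingZeros64(^word), i.e. the lowest zero bit.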
+ +var ( + cancel context.CancelFunc + cfg *rest.Config + ctx context.Context + k8sClient client.Client + testEnv *envtest.Environment + mgr ctrl.Manager + pa *PortAllocator +) + +func TestPortAllocator(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Port Allocator Suite") +} +func genHostPortPod(name string, nodeName string, port int32, clusterLevel bool) corev1.Pod { + var labels map[string]string + if clusterLevel { + labels = map[string]string{ + constants.GenHostPortLabel: constants.GenHostPortLabelValue, + constants.GenHostPortNameLabel: "test", + constants.LabelKeyOwner: nodeName, + } + } else { + labels = map[string]string{ + constants.LabelComponent: constants.ComponentWorker, + constants.LabelKeyOwner: nodeName, + } + } + return corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: "default", + Labels: labels, + Annotations: map[string]string{ + constants.GenPortNumberAnnotation: fmt.Sprintf("%d", port), + }, + }, + Spec: corev1.PodSpec{ + NodeName: nodeName, + Containers: []corev1.Container{ + { + Name: "test", + Image: "test-image", + Ports: []corev1.ContainerPort{ + { + Name: "test", + ContainerPort: 80, + HostPort: port, + }, + }, + }, + }, + }, + } +} + +var _ = BeforeSuite(func() { + logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) + + ctx, cancel = context.WithCancel(context.TODO()) + + By("bootstrapping test environment") + testEnv = &envtest.Environment{ + CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, + ErrorIfCRDPathMissing: false, + BinaryAssetsDirectory: filepath.Join("..", "..", "bin", "k8s", + fmt.Sprintf("1.31.0-%s-%s", runtime.GOOS, runtime.GOARCH)), + } + + var err error + // cfg is defined in this file globally. + cfg, err = testEnv.Start() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg).NotTo(BeNil()) + + err = tfv1.AddToScheme(scheme.Scheme) + Expect(err).NotTo(HaveOccurred()) + + err = corev1.AddToScheme(scheme.Scheme) + Expect(err).NotTo(HaveOccurred()) + + // Create a Kubernetes client + k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) + Expect(err).NotTo(HaveOccurred()) + Expect(k8sClient).NotTo(BeNil()) + + mgr, err = ctrl.NewManager(cfg, ctrl.Options{ + Scheme: scheme.Scheme, + Metrics: metricsserver.Options{ + BindAddress: "0", + }, + }) + Expect(err).NotTo(HaveOccurred()) + + // Create test GPUs with metadata only first + workers := []corev1.Pod{ + genHostPortPod("worker-1", "node-1", 40000, false), + genHostPortPod("worker-2", "node-1", 40001, false), + genHostPortPod("worker-3", "node-1", 40127, false), + genHostPortPod("worker-4", "node-2", 40003, false), + genHostPortPod("worker-5", "node-2", 40065, false), + genHostPortPod("lab-1", "node-1", 42001, true), + genHostPortPod("lab-2", "node-1", 59999, true), + } + + // First create the GPUs without status + for i := range workers { + err = k8sClient.Create(ctx, &workers[i]) + Expect(err).NotTo(HaveOccurred()) + } + + pa, err = NewPortAllocator(ctx, k8sClient, "40000-42000", "42001-60000") + Expect(err).NotTo(HaveOccurred()) + readyCh := pa.SetupWithManager(ctx, mgr) + Expect(err).NotTo(HaveOccurred()) + + go func() { + defer GinkgoRecover() + err = mgr.Start(ctx) + Expect(err).ToNot(HaveOccurred(), "failed to run manager") + }() + <-readyCh +}) + +var _ = AfterSuite(func() { + By("tearing down the test environment") + cancel() + err := testEnv.Stop() + Expect(err).NotTo(HaveOccurred()) +}) diff --git a/internal/portallocator/portallocator_test.go 
b/internal/portallocator/portallocator_test.go new file mode 100644 index 00000000..cccd58f0 --- /dev/null +++ b/internal/portallocator/portallocator_test.go @@ -0,0 +1,166 @@ +package portallocator + +import ( + "fmt" + "strconv" + "sync" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("Port Allocator", func() { + BeforeEach(func() { + // Reset state before each test + // This is important to ensure tests don't interfere with each other + // We're using the existing pa instance from the suite setup + }) + + Context("AssignHostPort", func() { + It("should assign a valid port for a node", func() { + port, err := pa.AssignHostPort("node-1") + Expect(err).NotTo(HaveOccurred()) + Expect(port).To(Equal(40002)) + + port, err = pa.AssignHostPort("node-1") + Expect(err).NotTo(HaveOccurred()) + Expect(port).To(Equal(40003)) + + err = pa.ReleaseHostPort("node-1", 40002) + Expect(err).NotTo(HaveOccurred()) + + port, err = pa.AssignHostPort("node-1") + Expect(err).NotTo(HaveOccurred()) + Expect(port).To(Equal(40002)) + + port, err = pa.AssignHostPort("node-new") + Expect(err).NotTo(HaveOccurred()) + Expect(port).To(Equal(40000)) + }) + + It("should fail when node name is empty", func() { + _, err := pa.AssignHostPort("") + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("node name cannot be empty")) + }) + + It("should exhaust ports and return error when no ports available", func() { + // Create a node with a small port range for testing exhaustion + nodeName := "exhaust-test-node" + + // Assign ports until we get an error + var lastPort int + var err error + assignedPorts := make(map[int]bool) + + // Keep assigning ports until we get an error or hit a reasonable limit + for i := 0; i < 2002; i++ { + lastPort, err = pa.AssignHostPort(nodeName) + if err != nil { + break + } + + // Verify we don't get duplicate ports + Expect(assignedPorts).NotTo(HaveKey(lastPort)) + assignedPorts[lastPort] = true + } + + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("no available port")) + }) + }) + + Context("ReleaseHostPort", func() { + It("should release a port successfully", func() { + nodeName := "release-test-node" + port, err := pa.AssignHostPort(nodeName) + Expect(err).NotTo(HaveOccurred()) + + err = pa.ReleaseHostPort(nodeName, port) + Expect(err).NotTo(HaveOccurred()) + }) + + It("should fail to release ports with invalid parameters", func() { + tests := []struct { + description string + node string + port int + errorMsg string + }{ + {"invalid node name", "invalid-node-name", 40001, "node invalid-node-name not found"}, + {"port is zero", "node-1", 0, "port cannot be 0 when release host port"}, + } + + for _, tc := range tests { + By(tc.description) + err := pa.ReleaseHostPort(tc.node, tc.port) + Expect(err).To(HaveOccurred()) + if tc.errorMsg != "" { + Expect(err.Error()).To(ContainSubstring(tc.errorMsg)) + } + } + }) + }) + + Context("Cluster Level Port Allocation", func() { + It("should assign and release cluster level ports", func() { + podName := "test-cluster-pod" + port, err := pa.AssignClusterLevelHostPort(podName) + Expect(err).NotTo(HaveOccurred()) + Expect(port).To(Equal(42002)) + + err = pa.ReleaseClusterLevelHostPort(podName, port) + Expect(err).NotTo(HaveOccurred()) + + err = pa.ReleaseClusterLevelHostPort(podName, 59999) + Expect(err).NotTo(HaveOccurred()) + + port, err = pa.AssignClusterLevelHostPort(podName) + Expect(err).NotTo(HaveOccurred()) + Expect(port).To(Equal(42002)) + }) + + It("should fail to 
release a cluster port with invalid parameters", func() {
+			err := pa.ReleaseClusterLevelHostPort("test-pod", 0)
+			Expect(err).To(HaveOccurred())
+			Expect(err.Error()).To(ContainSubstring("port cannot be 0 when release host port"))
+		})
+	})
+
+	Context("Concurrency", func() {
+		It("should handle concurrent port assignments and releases", func() {
+			const workers = 20
+			var wg sync.WaitGroup
+			results := make(chan error, workers)
+
+			wg.Add(workers)
+			for i := 0; i < workers; i++ {
+				go func(i int) {
+					defer wg.Done()
+					node := "concurrent-node-" + strconv.Itoa(i%5)
+					_, err := pa.AssignHostPort(node)
+					if err != nil {
+						results <- fmt.Errorf("assignment failed: %v", err)
+						return
+					}
+				}(i)
+			}
+
+			// Wait for all goroutines to complete
+			wg.Wait()
+
+			for i := 0; i < 5; i++ {
+				bitMap := pa.BitmapPerNode["concurrent-node-"+strconv.Itoa(i)]
+				Expect(bitMap).To(HaveLen(32))
+				Expect(bitMap[0]).To(Equal(uint64(0xf)))
+			}
+
+			close(results)
+
+			// Check for any errors
+			for err := range results {
+				Expect(err).NotTo(HaveOccurred())
+			}
+		})
+	})
+})
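Note: the bitmap expectations in the concurrency test follow directly from the node-level range "40000-42000" passed to NewPortAllocator: 2001 ports need ceil(2001/64) = 32 uint64 words, and four assignments on a fresh node set the four lowest bits of word 0 (0xf). The sketch below illustrates that first-free-bit bookkeeping under those assumptions; the names are illustrative and not the actual PortAllocator implementation.

package main

import "fmt"

// firstFreePort scans a per-node bitmap (one bit per port, lowest bit first)
// and returns the lowest unused port at or above rangeStart.
func firstFreePort(bitmap []uint64, rangeStart int) (int, bool) {
	for w, word := range bitmap {
		if word == ^uint64(0) { // every port tracked by this word is taken
			continue
		}
		for b := 0; b < 64; b++ {
			if word&(1<<uint(b)) == 0 {
				return rangeStart + w*64 + b, true
			}
		}
	}
	return 0, false
}

func main() {
	// 40000-42000 inclusive is 2001 ports, so 32 words, matching HaveLen(32).
	bitmap := make([]uint64, 32)
	for i := 0; i < 4; i++ {
		p, _ := firstFreePort(bitmap, 40000)
		bitmap[(p-40000)/64] |= 1 << uint((p-40000)%64)
	}
	fmt.Printf("bitmap[0] = %#x\n", bitmap[0]) // 0xf, as the test expects
}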
diff --git a/internal/server/router/assign_host_port.go b/internal/server/router/assign_host_port.go
new file mode 100644
index 00000000..8633c9c6
--- /dev/null
+++ b/internal/server/router/assign_host_port.go
@@ -0,0 +1,33 @@
+package router
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+
+	"github.com/NexusGPU/tensor-fusion/internal/portallocator"
+	"github.com/gin-gonic/gin"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+)
+
+type AssignHostPortRouter struct {
+	allocator *portallocator.PortAllocator
+}
+
+func NewAssignHostPortRouter(ctx context.Context, allocator *portallocator.PortAllocator) (*AssignHostPortRouter, error) {
+	return &AssignHostPortRouter{allocator: allocator}, nil
+}
+
+func (r *AssignHostPortRouter) AssignHostPort(ctx *gin.Context) {
+	// TODO: verify the service account token; the issuer must match the current instance,
+	// i.e. the request must come from a peer operator Pod
+
+	podName := ctx.Query("podName")
+	port, err := r.allocator.AssignClusterLevelHostPort(podName)
+	if err != nil {
+		ctx.String(http.StatusInternalServerError, err.Error())
+		return
+	}
+	log.FromContext(ctx).Info("assigned host port", "podName", podName, "port", port)
+	ctx.String(http.StatusOK, fmt.Sprintf("%d", port))
+}
diff --git a/internal/server/router/connection.go b/internal/server/router/connection.go
index 5008048d..7b3a7661 100644
--- a/internal/server/router/connection.go
+++ b/internal/server/router/connection.go
@@ -117,11 +117,13 @@ func (cw *connectionWatcher) subscribe(req types.NamespacedName) (connectionChan
 func (cw *connectionWatcher) watchConnections(ctx context.Context, watcher watch.Interface) {
 	// Watch for changes
 	defer watcher.Stop()
+	watcherChan := watcher.ResultChan()
 	for {
+
 		select {
 		case <-ctx.Done():
 			return
-		case event, ok := <-watcher.ResultChan():
+		case event, ok := <-watcherChan:
 			if !ok {
 				return
 			}
diff --git a/internal/server/server.go b/internal/server/server.go
index 040a94c6..816d7c3f 100644
--- a/internal/server/server.go
+++ b/internal/server/server.go
@@ -8,6 +8,7 @@ import (
 
 func NewHTTPServer(
 	cr *router.ConnectionRouter,
+	ahp *router.AssignHostPortRouter,
 ) *gin.Engine {
 	r := gin.New()
 
@@ -17,5 +18,6 @@ func NewHTTPServer(
 	apiGroup := r.Group("/api")
 	apiGroup.GET("/connection", cr.Get)
+	apiGroup.POST("/assign-host-port", ahp.AssignHostPort)
 	return r
 }
diff --git a/internal/utils/net.go b/internal/utils/net.go
new file mode 100644
index 00000000..c8940c04
--- /dev/null
+++ b/internal/utils/net.go
@@ -0,0 +1,38 @@
+package utils
+
+import "net"
+
+// CurrentIP returns the IPv4 address of the first non-loopback interface that is up; it panics if none is found.
+func CurrentIP() string {
+	interfaces, err := net.Interfaces()
+	if err != nil {
+		panic(err)
+	}
+
+	for _, iface := range interfaces {
+		if iface.Flags&net.FlagUp == 0 || iface.Flags&net.FlagLoopback != 0 {
+			continue
+		}
+
+		addrs, err := iface.Addrs()
+		if err != nil {
+			continue
+		}
+
+		for _, addr := range addrs {
+			ipNet, ok := addr.(*net.IPNet)
+			if !ok {
+				continue
+			}
+
+			ip := ipNet.IP
+			if ip.IsLoopback() || ip.To4() == nil {
+				continue
+			}
+
+			return ip.String()
+		}
+	}
+
+	panic("no internal IP address found")
+}
diff --git a/internal/utils/reconcile.go b/internal/utils/reconcile.go
index bcd80239..eb7da3b7 100644
--- a/internal/utils/reconcile.go
+++ b/internal/utils/reconcile.go
@@ -29,6 +29,15 @@ var ErrNextLoop = errors.New("stop this loop and return the associated Result ob
 // ErrTerminateLoop is not a real error. It forces the current reconciliation loop to stop
 var ErrTerminateLoop = errors.New("stop this loop and do not requeue")
+
+// Minimum time between reconciliations for the same object
+var debounceInterval = 3 * time.Second
+
+func init() {
+	if os.Getenv("GO_TESTING") == "true" {
+		debounceInterval = 60 * time.Millisecond
+	}
+}
+
 // HandleFinalizer ensures proper finalizer management for Kubernetes resources.
 // It automatically adds the finalizer when needed, and removes it after successful cleanup.
 // Returns (shouldReturn, err):
@@ -147,10 +156,6 @@ func CompareAndGetObjectHash(hash string, obj ...any) (bool, string) {
 const DebounceKeySuffix = ":in_queue"
 
 func DebouncedReconcileCheck(ctx context.Context, lastProcessedItems *sync.Map, name types.NamespacedName) (runNow bool, alreadyQueued bool, waitTime time.Duration) {
-	const (
-		// Minimum time between reconciliations for the same object
-		debounceInterval = 3 * time.Second
-	)
 	now := time.Now()
 	key := name.String()
 	inQueueKey := key + DebounceKeySuffix
diff --git a/internal/webhook/v1/pod_webhook.go b/internal/webhook/v1/pod_webhook.go
index cd460220..57bb13eb 100644
--- a/internal/webhook/v1/pod_webhook.go
+++ b/internal/webhook/v1/pod_webhook.go
@@ -19,8 +19,12 @@ package v1
 import (
 	"context"
 	"encoding/json"
+	goErrors "errors"
 	"fmt"
+	"io"
 	"net/http"
+	"strconv"
+	"time"
 
 	"gomodules.xyz/jsonpatch/v2"
 	corev1 "k8s.io/api/core/v1"
@@ -36,6 +40,7 @@ import (
 	"al.essio.dev/pkg/shellescape"
 	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
 	"github.com/NexusGPU/tensor-fusion/internal/constants"
+	"github.com/NexusGPU/tensor-fusion/internal/portallocator"
 	"github.com/NexusGPU/tensor-fusion/internal/utils"
 	"github.com/NexusGPU/tensor-fusion/internal/worker"
 	"github.com/lithammer/shortuuid/v4"
@@ -43,22 +48,24 @@ import (
 )
 
 // SetupPodWebhookWithManager registers the webhook for Pod in the manager.
-func SetupPodWebhookWithManager(mgr ctrl.Manager) error {
+func SetupPodWebhookWithManager(mgr ctrl.Manager, portAllocator *portallocator.PortAllocator) error {
 	webhookServer := mgr.GetWebhookServer()
 
 	webhookServer.Register("/mutate-v1-pod", &admission.Webhook{
 		Handler: &TensorFusionPodMutator{
-			decoder: admission.NewDecoder(runtime.NewScheme()),
-			Client:  mgr.GetClient(),
+			decoder:       admission.NewDecoder(runtime.NewScheme()),
+			Client:        mgr.GetClient(),
+			portAllocator: portAllocator,
 		},
 	})
 	return nil
 }
 
 type TensorFusionPodMutator struct {
-	Client  client.Client
-	decoder admission.Decoder
+	Client        client.Client
+	decoder       admission.Decoder
+	portAllocator *portallocator.PortAllocator
 }
 
 // Handle implements admission.Handler interface.
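Note: cmd/main.go is not part of this excerpt, so the composition below is only a hedged sketch of how these pieces presumably fit together, with one PortAllocator shared by the leader-only HTTP endpoint and the pod webhook. The function and type names come from this diff; the helper name, the ConnectionRouter parameter, and the :8080 listen address are assumptions.

package bootstrap

import (
	"context"

	"github.com/NexusGPU/tensor-fusion/internal/portallocator"
	"github.com/NexusGPU/tensor-fusion/internal/server"
	"github.com/NexusGPU/tensor-fusion/internal/server/router"
	webhookv1 "github.com/NexusGPU/tensor-fusion/internal/webhook/v1"
	ctrl "sigs.k8s.io/controller-runtime"
)

// wireHostPortAllocation is a hypothetical helper, not repository code.
func wireHostPortAllocation(ctx context.Context, mgr ctrl.Manager, connRouter *router.ConnectionRouter) error {
	// Build the allocator from the node-level and cluster-level ranges used in the tests.
	portAllocator, err := portallocator.NewPortAllocator(ctx, mgr.GetClient(), "40000-42000", "42001-60000")
	if err != nil {
		return err
	}
	readyCh := portAllocator.SetupWithManager(ctx, mgr)

	ahp, err := router.NewAssignHostPortRouter(ctx, portAllocator)
	if err != nil {
		return err
	}

	// Serve the allocation endpoint once the allocator has restored existing assignments;
	// :8080 matches the address the webhook uses when forwarding to the leader.
	go func() {
		<-readyCh
		_ = server.NewHTTPServer(connRouter, ahp).Run(":8080")
	}()

	// The mutating webhook needs the same allocator to hand out cluster-level host ports.
	return webhookv1.SetupPodWebhookWithManager(mgr, portAllocator)
}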
@@ -97,18 +104,18 @@ func (m *TensorFusionPodMutator) Handle(ctx context.Context, req admission.Reque podCounterAnnotationKey = podCounterKey } + pool := &tfv1.GPUPool{} + if err := m.Client.Get(ctx, client.ObjectKey{Name: tfInfo.Profile.PoolName}, pool); err != nil { + return admission.Errored(http.StatusInternalServerError, fmt.Errorf("gpu pool(%s) does not exist", tfInfo.Profile.PoolName)) + } + workload := &tfv1.TensorFusionWorkload{} if tfInfo.GenWorkload { - if err := m.createOrUpdateWorkload(ctx, pod, &tfInfo, workload); err != nil { + if err := m.createOrUpdateWorkload(ctx, pod, &tfInfo, workload, pool); err != nil { return admission.Errored(http.StatusInternalServerError, fmt.Errorf("create tf workload: %w", err)) } } - pool := &tfv1.GPUPool{} - if err := m.Client.Get(ctx, client.ObjectKey{Name: tfInfo.Profile.PoolName}, pool); err != nil { - return admission.Errored(http.StatusInternalServerError, fmt.Errorf("gpu pool(%s) does not exist", tfInfo.Profile.PoolName)) - } - var nodeSelector map[string]string if tfInfo.Profile.IsLocalGPU { if !tfInfo.GenWorkload { @@ -116,12 +123,26 @@ func (m *TensorFusionPodMutator) Handle(ctx context.Context, req admission.Reque return admission.Errored(http.StatusInternalServerError, fmt.Errorf("workload(%s) does not exist", tfInfo.WorkloadName)) } } - workloadStatus, err := worker.SelectWorker(ctx, m.Client, workload, 1) - if err != nil { - log.Error(err, "failed to select worker for pod", "pod", req.Name, "namespace", req.Namespace) - return admission.Errored(http.StatusInternalServerError, fmt.Errorf("select worker: %w", err)) + + workerFound := false + for i := 0; i < 25; i++ { + workloadStatus, err := worker.SelectWorker(ctx, m.Client, workload, 1) + if err != nil { + if goErrors.Is(err, worker.ErrNoAvailableWorker) { + time.Sleep(time.Second) + continue + } + log.Error(err, "failed to select worker for pod", "pod", req.Name, "namespace", req.Namespace) + return admission.Errored(http.StatusInternalServerError, fmt.Errorf("select worker: %w", err)) + } + nodeSelector = workloadStatus.NodeSelector + workerFound = true + break + } + + if !workerFound { + return admission.Errored(http.StatusInternalServerError, fmt.Errorf("no available worker for pod: %s", req.Name)) } - nodeSelector = workloadStatus.NodeSelector } // Inject initContainer and env variables @@ -153,10 +174,12 @@ func (m *TensorFusionPodMutator) InjectDecoder(d admission.Decoder) error { return nil } -func (m *TensorFusionPodMutator) createOrUpdateWorkload(ctx context.Context, pod *corev1.Pod, tfInfo *TensorFusionInfo, workload *tfv1.TensorFusionWorkload) error { +func (m *TensorFusionPodMutator) createOrUpdateWorkload(ctx context.Context, pod *corev1.Pod, tfInfo *TensorFusionInfo, workload *tfv1.TensorFusionWorkload, pool *tfv1.GPUPool) error { // Check if workload exists err := m.Client.Get(ctx, client.ObjectKey{Name: tfInfo.WorkloadName, Namespace: pod.Namespace}, workload) + qos := calculateQoSLevel(tfInfo.Profile, pool) + if err != nil { if !errors.IsNotFound(err) { return fmt.Errorf("failed to get workload: %w", err) @@ -182,7 +205,7 @@ func (m *TensorFusionPodMutator) createOrUpdateWorkload(ctx context.Context, pod PoolName: tfInfo.Profile.PoolName, Resources: tfInfo.Profile.Resources, GPUCount: tfInfo.Profile.GPUCount, - Qos: tfInfo.Profile.Qos, + Qos: qos, GPUModel: tfInfo.Profile.GPUModel, IsLocalGPU: tfInfo.Profile.IsLocalGPU, }, @@ -210,7 +233,7 @@ func (m *TensorFusionPodMutator) createOrUpdateWorkload(ctx context.Context, pod Replicas: &replicas, PoolName: 
tfInfo.Profile.PoolName,
 			Resources:  tfInfo.Profile.Resources,
-			Qos:        tfInfo.Profile.Qos,
+			Qos:        qos,
 			IsLocalGPU: tfInfo.Profile.IsLocalGPU,
 			GPUCount:   tfInfo.Profile.GPUCount,
 			GPUModel:   tfInfo.Profile.GPUModel,
@@ -266,8 +289,16 @@ func (m *TensorFusionPodMutator) patchTFClient(
 		pod.Labels = map[string]string{}
 	}
 	pod.Labels[constants.LabelKeyPodTemplateHash] = utils.GetObjectHash(clientConfig)
+	pod.Labels[constants.LabelComponent] = constants.ComponentClient
 	pod.Labels[constants.GpuPoolKey] = pool.Name
 
+	// Patch hostPort allocation
+	if pod.Labels[constants.GenHostPortLabel] == constants.GenHostPortLabelValue {
+		if err := m.generateHostPort(pod, pod.Labels[constants.GenHostPortNameLabel]); err != nil {
+			return nil, fmt.Errorf("can not generate host port: %w", err)
+		}
+	}
+
 	containerPatched := false
 	// Patch to Container
 	for _, name := range containerNames {
@@ -374,3 +405,94 @@ func (m *TensorFusionPodMutator) patchTFClient(
 	patches = append(patches, strategicpatches...)
 	return patches, nil
 }
+
+func (m *TensorFusionPodMutator) generateHostPort(pod *corev1.Pod, portName string) error {
+
+	portNameFound := false
+	containerIndex := -1
+	portIndex := -1
+	for i := range pod.Spec.Containers {
+		container := &pod.Spec.Containers[i]
+		for j := range container.Ports {
+			port := &container.Ports[j]
+			if port.Name == portName {
+				portNameFound = true
+				containerIndex = i
+				portIndex = j
+			}
+		}
+	}
+	if !portNameFound {
+		return fmt.Errorf("port name %s not found, can not assign host port for pod %s", portName, pod.Name)
+	}
+
+	if !m.portAllocator.IsLeader {
+		port, err := m.assignClusterHostPortFromLeader(pod)
+		if err != nil {
+			return fmt.Errorf("can not assign cluster host port from leader: %w", err)
+		}
+		pod.Annotations[constants.GenPortNumberAnnotation] = strconv.Itoa(port)
+	} else {
+		port, err := m.portAllocator.AssignClusterLevelHostPort(pod.Name)
+		if err != nil {
+			return fmt.Errorf("can not assign cluster level host port: %w", err)
+		}
+		pod.Annotations[constants.GenPortNumberAnnotation] = strconv.Itoa(port)
+	}
+
+	pod.Spec.Containers[containerIndex].Ports[portIndex].HostPort = int32(m.getPortNumber(pod))
+	return nil
+}
+
+func (m *TensorFusionPodMutator) getPortNumber(pod *corev1.Pod) int {
+	portNumber, _ := strconv.Atoi(pod.Annotations[constants.GenPortNumberAnnotation])
+	return portNumber
+}
+
+func (m *TensorFusionPodMutator) assignClusterHostPortFromLeader(pod *corev1.Pod) (int, error) {
+	httpClient := &http.Client{Timeout: 10 * time.Second}
+	leaderIP := m.portAllocator.GetLeaderIP()
+	if leaderIP == "" {
+		return 0, fmt.Errorf("operator leader IP not found")
+	}
+
+	// The leader registers this handler as POST /api/assign-host-port (see internal/server/server.go)
+	url := fmt.Sprintf("http://%s:8080/api/assign-host-port?podName=%s", leaderIP, pod.Name)
+	resp, err := httpClient.Post(url, "text/plain", nil)
+	if err != nil {
+		return 0, fmt.Errorf("failed to assign host port: %w", err)
+	}
+	defer func() {
+		_ = resp.Body.Close()
+	}()
+
+	if resp.StatusCode != http.StatusOK {
+		return 0, fmt.Errorf("host port allocation failed: %s", resp.Status)
+	}
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return 0, fmt.Errorf("failed to read allocation response: %w", err)
+	}
+
+	return strconv.Atoi(string(body))
+}
+
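Note: for orientation, the sketch below shows roughly the pod shape that takes the generateHostPort path above: the opt-in label pair plus a named container port for the webhook to rewrite. The label keys are referenced through the constants package because their literal values are not visible in this diff; the pod name, image, and port number are made up, and the TensorFusion annotations that route a pod through this mutator in the first place are omitted.

package example

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/NexusGPU/tensor-fusion/internal/constants"
)

// hostPortClientPod sketches a client pod that asks the webhook for a
// cluster-level host port on the container port named "http".
func hostPortClientPod() corev1.Pod {
	return corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "inference-client",
			Namespace: "default",
			Labels: map[string]string{
				constants.GenHostPortLabel:     constants.GenHostPortLabelValue,
				constants.GenHostPortNameLabel: "http", // must match a container port name
			},
			// The webhook writes GenPortNumberAnnotation here after allocation.
			Annotations: map[string]string{},
		},
		Spec: corev1.PodSpec{
			Containers: []corev1.Container{{
				Name:  "app",
				Image: "app:latest",
				Ports: []corev1.ContainerPort{{Name: "http", ContainerPort: 8000}},
			}},
		},
	}
}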
+func calculateQoSLevel(profile *tfv1.WorkloadProfileSpec, pool *tfv1.GPUPool) tfv1.QoSLevel {
+	sameReqLimits := profile.Resources.Limits.Tflops.Value() == profile.Resources.Requests.Tflops.Value() &&
+		profile.Resources.Limits.Vram.Value() == profile.Resources.Requests.Vram.Value()
+
+	// set to critical if requests == limits, same logic as Kubernetes QoS classes
+	if sameReqLimits {
+		return constants.QoSLevelCritical
+	}
+
+	// when QoS is not set, fall back to the pool's default QoS, or medium if the pool has none
+	if profile.Qos == "" {
+		if pool.Spec.QosConfig == nil || pool.Spec.QosConfig.DefaultQoS == "" {
+			return constants.QoSLevelMedium
+		}
+		return pool.Spec.QosConfig.DefaultQoS
+	}
+	return profile.Qos
+}
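Note: the precedence in calculateQoSLevel is easy to misread, so here it is restated as a tiny standalone sketch using plain strings instead of the tfv1 types; the string values are placeholders, not the real constants.

package main

import "fmt"

// qosFor mirrors the decision order of calculateQoSLevel: equal requests and
// limits always win; otherwise an explicit profile QoS is used as-is; an empty
// QoS falls back to the pool default, or to "medium" when the pool has none.
func qosFor(sameReqLimits bool, profileQoS, poolDefault string) string {
	if sameReqLimits {
		return "critical"
	}
	if profileQoS == "" {
		if poolDefault == "" {
			return "medium"
		}
		return poolDefault
	}
	return profileQoS
}

func main() {
	fmt.Println(qosFor(true, "low", "high"))  // critical: equal requests/limits override everything
	fmt.Println(qosFor(false, "", "high"))    // high: pool default fills the gap
	fmt.Println(qosFor(false, "", ""))        // medium: hard-coded fallback
	fmt.Println(qosFor(false, "low", "high")) // low: explicit profile QoS wins over the default
}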
diff --git a/internal/webhook/v1/webhook_suite_test.go b/internal/webhook/v1/webhook_suite_test.go
index 25eb459f..4e5d369b 100644
--- a/internal/webhook/v1/webhook_suite_test.go
+++ b/internal/webhook/v1/webhook_suite_test.go
@@ -28,6 +28,7 @@ import (
 
 	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
 	"github.com/NexusGPU/tensor-fusion/internal/config"
+	"github.com/NexusGPU/tensor-fusion/internal/portallocator"
 
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
@@ -61,7 +62,6 @@ var (
 
 func TestAPIs(t *testing.T) {
 	RegisterFailHandler(Fail)
-
 	RunSpecs(t, "Webhook Suite")
 }
 
@@ -134,7 +134,11 @@ var _ = BeforeSuite(func() {
 	})
 	Expect(err).NotTo(HaveOccurred())
 
-	err = SetupPodWebhookWithManager(mgr)
+	err = SetupPodWebhookWithManager(mgr, &portallocator.PortAllocator{
+		PortRangeStartCluster: 42000,
+		PortRangeEndCluster:   62000,
+		BitmapCluster:         make([]uint64, (62000-42000)/64+1),
+	})
 	Expect(err).NotTo(HaveOccurred())
 
 	// +kubebuilder:scaffold:webhook
diff --git a/internal/worker/worker.go b/internal/worker/worker.go
index 1d447176..6bb56569 100644
--- a/internal/worker/worker.go
+++ b/internal/worker/worker.go
@@ -3,34 +3,37 @@ package worker
 
 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"math"
 	"strconv"
 	"strings"
-	"time"
 
 	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
 	"github.com/NexusGPU/tensor-fusion/internal/config"
 	"github.com/NexusGPU/tensor-fusion/internal/constants"
 	"github.com/NexusGPU/tensor-fusion/internal/utils"
 	"github.com/samber/lo"
-	"golang.org/x/exp/rand"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
-func init() {
-	rand.Seed(uint64(time.Now().UnixNano()))
-}
-
 type WorkerGenerator struct {
 	GpuInfos     *[]config.GpuInfo
 	WorkerConfig *tfv1.WorkerConfig
 }
 
+var ErrNoAvailableWorker = errors.New("no available worker")
+
 func (wg *WorkerGenerator) WorkerPort(pod *corev1.Pod) (int, error) {
-	port, ok := lo.Find(pod.Spec.Containers[0].Env, func(env corev1.EnvVar) bool {
+	portAnnotation, ok := pod.Annotations[constants.GenPortNumberAnnotation]
+	if ok {
+		return strconv.Atoi(portAnnotation)
+	}
+
+	// Fall back to the env var for older worker Pods that carry no port annotation
+	portEnv, ok := lo.Find(pod.Spec.Containers[0].Env, func(env corev1.EnvVar) bool {
 		return env.Name == constants.WorkerPortEnv
 	})
 
@@ -38,13 +41,7 @@ func (wg *WorkerGenerator) WorkerPort(pod *corev1.Pod) (int, error) {
 		return 0, fmt.Errorf("worker port not found in pod %s", pod.Name)
 	}
 
-	return strconv.Atoi(port.Value)
-}
-
-func (wg *WorkerGenerator) AllocPort() int {
-	min := 30000
-	max := 65535
-	return rand.Intn(max-min+1) + min
+	return strconv.Atoi(portEnv.Value)
 }
 
 func (wg *WorkerGenerator) PodTemplateHash(workloadSpec any) (string, error) {
@@ -61,6 +58,7 @@ func (wg *WorkerGenerator) GenerateWorkerPod(
 	generateName string,
 	namespace string,
 	port int,
+	requests tfv1.Resource,
 	limits tfv1.Resource,
 	podTemplateHash string,
 ) (*corev1.Pod, string, error) {
@@ -146,11 +144,32 @@ func (wg *WorkerGenerator) GenerateWorkerPod(
 			},
 		},
 	})
-
+	workerLabels := map[string]string{
+		constants.LabelComponent: constants.ComponentWorker,
+	}
+	if podTmpl.Template.Labels != nil {
+		for k, v := range podTmpl.Template.Labels {
+			workerLabels[k] = v
+		}
+	}
+	workerAnnotations := map[string]string{
+		constants.TFLOPSRequestAnnotation: requests.Tflops.String(),
+		constants.TFLOPSLimitAnnotation:   limits.Tflops.String(),
+		constants.VRAMRequestAnnotation:   requests.Vram.String(),
+		constants.VRAMLimitAnnotation:     limits.Vram.String(),
+		constants.GenPortNumberAnnotation: strconv.Itoa(port),
+	}
+	if podTmpl.Template.Annotations != nil {
+		for k, v := range podTmpl.Template.Annotations {
+			workerAnnotations[k] = v
+		}
+	}
 	return &corev1.Pod{
 		ObjectMeta: metav1.ObjectMeta{
 			GenerateName: generateName,
 			Namespace:    namespace,
+			Labels:       workerLabels,
+			Annotations:  workerAnnotations,
 		},
 		Spec: spec,
 	}, podTemplateHash, nil
@@ -163,7 +182,7 @@ func SelectWorker(
 	maxSkew int32,
 ) (*tfv1.WorkerStatus, error) {
 	if len(workload.Status.WorkerStatuses) == 0 {
-		return nil, fmt.Errorf("no available worker")
+		return nil, ErrNoAvailableWorker
 	}
 	usageMapping := lo.SliceToMap(workload.Status.WorkerStatuses, func(status tfv1.WorkerStatus) (string, int) {