Skip to content

Commit e4fab68

Browse files
authored
fix: add log collection config and anti affinity for operator, assign port for worker/lab-pod (#208)
* fix: add log collection config and anti affinity for operator, assign component label for log grouping
* fix: port allocator issues
* fix: hypervisor permission issue; assign port from leader
* fix: metrics and port allocator bugs
* fix: typo
* fix: optimize tests
* fix: move global mutex of portallocator to struct
* fix: multiple gpu metrics
* fix: merge conflict
* chore: merge code unit test issues
* fix: unit test issues, workload count bug
* fix: gpu sync time in testing mode
* fix: gpu allocator test case bug
1 parent 5cbfd14 commit e4fab68

File tree

70 files changed

+2407
-639
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

70 files changed

+2407
-639
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ Dockerfile.cross
1212

1313
# Output of the go coverage tool, specifically when used with LiteIDE
1414
*.out
15+
cover.out.*
1516

1617
# Go workspace file
1718
go.work

.vscode/launch.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,10 @@
5555
"type": "go",
5656
"request": "launch",
5757
"mode": "test",
58-
"program": "${workspaceFolder}",
58+
"env": {
59+
"GO_TESTING": "true"
60+
},
61+
"program": "${workspaceFolder}/internal/controller",
5962
"console": "integratedTerminal"
6063
}
6164
]

.vscode/settings.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
11
{
22
"cSpell.words": [
3+
"alertmanager",
34
"alicloud",
45
"Aliyun",
56
"AMDCDNA",
67
"AMDRDNA",
78
"apimachinery",
9+
"automount",
810
"AWSGPU",
911
"batchv",
12+
"burstable",
1013
"CDNA",
1114
"certificaterequests",
1215
"certmanager",
@@ -39,6 +42,7 @@
3942
"greptime",
4043
"greptimedb",
4144
"healthz",
45+
"iface",
4246
"karpenter",
4347
"kubebuilder",
4448
"KUBECONFIG",
@@ -51,6 +55,7 @@
5155
"NVML",
5256
"omitempty",
5357
"onsi",
58+
"portallocator",
5459
"printcolumn",
5560
"prometheusagents",
5661
"prometheuses",
@@ -62,11 +67,13 @@
6267
"schedulingconfigtemplates",
6368
"schedulingcorev",
6469
"shirou",
70+
"strategicpatches",
6571
"subresource",
6672
"tensorfusion",
6773
"tensorfusionaiv",
6874
"tensorfusioncluster",
6975
"tensorfusionclusters",
76+
"tensorfusionworkload",
7077
"Tera",
7178
"tflops",
7279
"Tmpl",

Makefile

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,8 @@ vet: ## Run go vet against code.
6262

6363
.PHONY: test
6464
test: manifests generate fmt vet envtest ## Run tests.
65-
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -timeout 0 -coverprofile cover.out
65+
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" GO_TESTING=true go run github.com/onsi/ginkgo/v2/ginkgo -p -timeout 0 -cover -coverprofile cover.out -r --skip-file ./test/e2e
6666

67-
# TODO(user): To use a different vendor for e2e tests, modify the setup under 'tests/e2e'.
68-
# The default setup assumes Kind is pre-installed and builds/loads the Manager Docker image locally.
69-
# Prometheus and CertManager are installed by default; skip with:
70-
# - PROMETHEUS_INSTALL_SKIP=true
71-
# - CERT_MANAGER_INSTALL_SKIP=true
7267
.PHONY: test-e2e
7368
test-e2e: manifests generate fmt vet ## Run the e2e tests. Expected an isolated environment using Kind.
7469
@command -v kind >/dev/null 2>&1 || { \

api/v1/gpu_types.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,18 @@ type GPUStatus struct {
3636
GPUModel string `json:"gpuModel"`
3737

3838
Message string `json:"message"`
39+
40+
// +optional
41+
RunningApps []*RunningAppDetail `json:"runningApps,omitempty"`
42+
}
43+
44+
type RunningAppDetail struct {
45+
// Workload name and namespace
46+
Name string `json:"name,omitempty"`
47+
Namespace string `json:"namespace,omitempty"`
48+
49+
// Worker count
50+
Count int `json:"count"`
3951
}
4052

4153
// +kubebuilder:validation:Enum=Pending;Provisioning;Running;Unknown;Destroying;Migrating

api/v1/gpunode_funcs.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ func (node *GPUNode) InitializeStatus(initTFlops, initVRAM resource.Quantity, in
1212
TotalTFlops: initTFlops,
1313
TotalVRAM: initVRAM,
1414
TotalGPUs: initGPUs,
15-
AllocationDetails: &[]GPUNodeAllocationDetails{},
15+
AllocationInfo: []*RunningAppDetail{},
1616
LoadedModels: &[]string{},
1717
ManagedGPUDeviceIDs: []string{},
1818
ObservedGeneration: node.Generation,

api/v1/gpunode_types.go

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -94,20 +94,8 @@ type GPUNodeStatus struct {
9494

9595
ObservedGeneration int64 `json:"observedGeneration,omitempty"`
9696

97-
// Allocation details is for node compaction, and calculate used apps
9897
// +optional
99-
AllocationDetails *[]GPUNodeAllocationDetails `json:"allocationDetails,omitempty"`
100-
}
101-
102-
type GPUNodeAllocationDetails struct {
103-
PodID string `json:"podID,omitempty"`
104-
PodName string `json:"podName,omitempty"`
105-
Namespace string `json:"namespace"`
106-
WorkloadName string `json:"workload,omitempty"`
107-
108-
Requests GPUResourceUnit `json:"requests"`
109-
Limits GPUResourceUnit `json:"limits"`
110-
QoS QoSLevel `json:"qos,omitempty"`
98+
AllocationInfo []*RunningAppDetail `json:"allocationInfo,omitempty"`
11199
}
112100

113101
// +kubebuilder:validation:Enum=Pending;Provisioning;Migrating;Running;Succeeded;Failed;Unknown;Destroying

api/v1/gpupool_types.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,7 @@ type QosPricing struct {
293293

294294
Requests GPUResourcePricingUnit `json:"requests,omitempty"`
295295

296-
// Default requests and limitsOverRequests are same, indicates normal on-demand serverless GPU usage, in hands-on lab low QoS case, limitsOverRequests should be cheaper, for example Low QoS, ratio should be 0.5
296+
// By default, requests and limitsOverRequests are the same, indicating normal on-demand serverless GPU usage; in the hands-on lab low-QoS case, limitsOverRequests should be lower so that users can get burstable GPU resources at very low cost
297297
// +kubebuilder:default="1"
298298
LimitsOverRequestsChargingRatio string `json:"limitsOverRequests,omitempty"`
299299
}
@@ -372,6 +372,8 @@ type GPUPoolStatus struct {
372372
AvailableTFlops resource.Quantity `json:"availableTFlops"`
373373
AvailableVRAM resource.Quantity `json:"availableVRAM"`
374374

375+
RunningAppsCnt int32 `json:"runningAppsCnt,omitempty"`
376+
375377
// +optional
376378
VirtualAvailableTFlops *resource.Quantity `json:"virtualAvailableTFlops,omitempty"`
377379
// +optional

api/v1/zz_generated.deepcopy.go

Lines changed: 34 additions & 25 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

charts/tensor-fusion/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.2.22
18+
version: 1.3.2
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to
2222
# follow Semantic Versioning. They should reflect the version the application is using.
2323
# It is recommended to use it with quotes.
24-
appVersion: "1.12.1"
24+
appVersion: "1.30.3"

charts/tensor-fusion/crds/tensor-fusion.ai_gpunodes.yaml

Lines changed: 8 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -86,66 +86,19 @@ spec:
8686
status:
8787
description: GPUNodeStatus defines the observed state of GPUNode.
8888
properties:
89-
allocationDetails:
90-
description: Allocation details is for node compaction, and calculate
91-
used apps
89+
allocationInfo:
9290
items:
9391
properties:
94-
limits:
95-
properties:
96-
tflops:
97-
anyOf:
98-
- type: integer
99-
- type: string
100-
description: Tera floating point operations per second
101-
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
102-
x-kubernetes-int-or-string: true
103-
vram:
104-
anyOf:
105-
- type: integer
106-
- type: string
107-
description: VRAM is short for Video memory, namely GPU
108-
RAM
109-
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
110-
x-kubernetes-int-or-string: true
111-
type: object
112-
namespace:
113-
type: string
114-
podID:
115-
type: string
116-
podName:
117-
type: string
118-
qos:
119-
enum:
120-
- low
121-
- medium
122-
- high
123-
- critical
92+
count:
93+
description: Worker count
94+
type: integer
95+
name:
96+
description: Workload name namespace
12497
type: string
125-
requests:
126-
properties:
127-
tflops:
128-
anyOf:
129-
- type: integer
130-
- type: string
131-
description: Tera floating point operations per second
132-
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
133-
x-kubernetes-int-or-string: true
134-
vram:
135-
anyOf:
136-
- type: integer
137-
- type: string
138-
description: VRAM is short for Video memory, namely GPU
139-
RAM
140-
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
141-
x-kubernetes-int-or-string: true
142-
type: object
143-
workload:
98+
namespace:
14499
type: string
145100
required:
146-
- limits
147-
- namespace
148-
- requests
101+
- count
149102
type: object
150103
type: array
151104
availableTFlops:

charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -546,7 +546,8 @@ spec:
546546
description: Default requests and limitsOverRequests are
547547
same, indicates normal on-demand serverless GPU usage,
548548
in hands-on lab low QoS case, limitsOverRequests should
549-
be cheaper, for example Low QoS, ratio should be 0.5
549+
be lower, so that user can get burstable GPU resources
550+
with very low cost
550551
type: string
551552
qos:
552553
enum:
@@ -704,6 +705,9 @@ spec:
704705
readyNodes:
705706
format: int32
706707
type: integer
708+
runningAppsCnt:
709+
format: int32
710+
type: integer
707711
savedCostsPerMonth:
708712
type: string
709713
totalGPUs:

charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,21 @@ spec:
116116
- Destroying
117117
- Migrating
118118
type: string
119+
runningApps:
120+
items:
121+
properties:
122+
count:
123+
description: Worker count
124+
type: integer
125+
name:
126+
description: Workload name namespace
127+
type: string
128+
namespace:
129+
type: string
130+
required:
131+
- count
132+
type: object
133+
type: array
119134
uuid:
120135
type: string
121136
required:

charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -650,8 +650,8 @@ spec:
650650
description: Default requests and limitsOverRequests
651651
are same, indicates normal on-demand serverless
652652
GPU usage, in hands-on lab low QoS case, limitsOverRequests
653-
should be cheaper, for example Low QoS, ratio
654-
should be 0.5
653+
should be lower, so that user can get burstable
654+
GPU resources with very low cost
655655
type: string
656656
qos:
657657
enum:

charts/tensor-fusion/templates/admission-webhooks/mutating-webhook.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ webhooks:
2222
resources:
2323
- pods
2424
sideEffects: None
25+
timeoutSeconds: 30
2526
objectSelector:
2627
matchExpressions:
2728
- key: tensor-fusion.ai/enabled

0 commit comments

Comments
 (0)