Skip to content

fix: add log collection config and anti affinity for operator, assign port for worker/lab-pod #208

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Jun 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Dockerfile.cross

# Output of the go coverage tool, specifically when used with LiteIDE
*.out
cover.out.*

# Go workspace file
go.work
Expand Down
5 changes: 4 additions & 1 deletion .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,10 @@
"type": "go",
"request": "launch",
"mode": "test",
"program": "${workspaceFolder}",
"env": {
"GO_TESTING": "true"
},
"program": "${workspaceFolder}/internal/controller",
"console": "integratedTerminal"
}
]
Expand Down
7 changes: 7 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
{
"cSpell.words": [
"alertmanager",
"alicloud",
"Aliyun",
"AMDCDNA",
"AMDRDNA",
"apimachinery",
"automount",
"AWSGPU",
"batchv",
"burstable",
"CDNA",
"certificaterequests",
"certmanager",
Expand Down Expand Up @@ -39,6 +42,7 @@
"greptime",
"greptimedb",
"healthz",
"iface",
"karpenter",
"kubebuilder",
"KUBECONFIG",
Expand All @@ -51,6 +55,7 @@
"NVML",
"omitempty",
"onsi",
"portallocator",
"printcolumn",
"prometheusagents",
"prometheuses",
Expand All @@ -62,11 +67,13 @@
"schedulingconfigtemplates",
"schedulingcorev",
"shirou",
"strategicpatches",
"subresource",
"tensorfusion",
"tensorfusionaiv",
"tensorfusioncluster",
"tensorfusionclusters",
"tensorfusionworkload",
"Tera",
"tflops",
"Tmpl",
Expand Down
7 changes: 1 addition & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,8 @@ vet: ## Run go vet against code.

.PHONY: test
test: manifests generate fmt vet envtest ## Run tests.
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -timeout 0 -coverprofile cover.out
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" GO_TESTING=true go run github.com/onsi/ginkgo/v2/ginkgo -p -timeout 0 -cover -coverprofile cover.out -r --skip-file ./test/e2e

# TODO(user): To use a different vendor for e2e tests, modify the setup under 'tests/e2e'.
# The default setup assumes Kind is pre-installed and builds/loads the Manager Docker image locally.
# Prometheus and CertManager are installed by default; skip with:
# - PROMETHEUS_INSTALL_SKIP=true
# - CERT_MANAGER_INSTALL_SKIP=true
.PHONY: test-e2e
test-e2e: manifests generate fmt vet ## Run the e2e tests. Expected an isolated environment using Kind.
@command -v kind >/dev/null 2>&1 || { \
Expand Down
12 changes: 12 additions & 0 deletions api/v1/gpu_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,18 @@ type GPUStatus struct {
GPUModel string `json:"gpuModel"`

Message string `json:"message"`

// +optional
RunningApps []*RunningAppDetail `json:"runningApps,omitempty"`
}

// RunningAppDetail identifies a workload by name and namespace and records
// its worker count. It is the element type of GPUStatus.RunningApps and
// GPUNodeStatus.AllocationInfo.
type RunningAppDetail struct {
// Name is the workload name.
Name string `json:"name,omitempty"`
// Namespace is the workload namespace.
Namespace string `json:"namespace,omitempty"`

// Count is the worker count for the workload.
Count int `json:"count"`
}

// +kubebuilder:validation:Enum=Pending;Provisioning;Running;Unknown;Destroying;Migrating
Expand Down
2 changes: 1 addition & 1 deletion api/v1/gpunode_funcs.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ func (node *GPUNode) InitializeStatus(initTFlops, initVRAM resource.Quantity, in
TotalTFlops: initTFlops,
TotalVRAM: initVRAM,
TotalGPUs: initGPUs,
AllocationDetails: &[]GPUNodeAllocationDetails{},
AllocationInfo: []*RunningAppDetail{},
LoadedModels: &[]string{},
ManagedGPUDeviceIDs: []string{},
ObservedGeneration: node.Generation,
Expand Down
14 changes: 1 addition & 13 deletions api/v1/gpunode_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,20 +94,8 @@ type GPUNodeStatus struct {

ObservedGeneration int64 `json:"observedGeneration,omitempty"`

// Allocation details is for node compaction, and calculate used apps
// +optional
AllocationDetails *[]GPUNodeAllocationDetails `json:"allocationDetails,omitempty"`
}

type GPUNodeAllocationDetails struct {
PodID string `json:"podID,omitempty"`
PodName string `json:"podName,omitempty"`
Namespace string `json:"namespace"`
WorkloadName string `json:"workload,omitempty"`

Requests GPUResourceUnit `json:"requests"`
Limits GPUResourceUnit `json:"limits"`
QoS QoSLevel `json:"qos,omitempty"`
AllocationInfo []*RunningAppDetail `json:"allocationInfo,omitempty"`
}

// +kubebuilder:validation:Enum=Pending;Provisioning;Migrating;Running;Succeeded;Failed;Unknown;Destroying
Expand Down
4 changes: 3 additions & 1 deletion api/v1/gpupool_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ type QosPricing struct {

Requests GPUResourcePricingUnit `json:"requests,omitempty"`

// Default requests and limitsOverRequests are same, indicates normal on-demand serverless GPU usage, in hands-on lab low QoS case, limitsOverRequests should be cheaper, for example Low QoS, ratio should be 0.5
// By default, requests and limitsOverRequests use the same charging ratio, which indicates normal on-demand serverless GPU usage; in low-QoS cases such as hands-on labs, limitsOverRequests should be lower so that users can get burstable GPU resources at very low cost
// +kubebuilder:default="1"
LimitsOverRequestsChargingRatio string `json:"limitsOverRequests,omitempty"`
}
Expand Down Expand Up @@ -372,6 +372,8 @@ type GPUPoolStatus struct {
AvailableTFlops resource.Quantity `json:"availableTFlops"`
AvailableVRAM resource.Quantity `json:"availableVRAM"`

RunningAppsCnt int32 `json:"runningAppsCnt,omitempty"`

// +optional
VirtualAvailableTFlops *resource.Quantity `json:"virtualAvailableTFlops,omitempty"`
// +optional
Expand Down
59 changes: 34 additions & 25 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions charts/tensor-fusion/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 1.2.22
version: 1.3.2

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "1.12.1"
appVersion: "1.30.3"
63 changes: 8 additions & 55 deletions charts/tensor-fusion/crds/tensor-fusion.ai_gpunodes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,66 +86,19 @@ spec:
status:
description: GPUNodeStatus defines the observed state of GPUNode.
properties:
allocationDetails:
description: Allocation details is for node compaction, and calculate
used apps
allocationInfo:
items:
properties:
limits:
properties:
tflops:
anyOf:
- type: integer
- type: string
description: Tera floating point operations per second
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
vram:
anyOf:
- type: integer
- type: string
description: VRAM is short for Video memory, namely GPU
RAM
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
namespace:
type: string
podID:
type: string
podName:
type: string
qos:
enum:
- low
- medium
- high
- critical
count:
description: Worker count
type: integer
name:
description: Workload name namespace
type: string
requests:
properties:
tflops:
anyOf:
- type: integer
- type: string
description: Tera floating point operations per second
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
vram:
anyOf:
- type: integer
- type: string
description: VRAM is short for Video memory, namely GPU
RAM
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
workload:
namespace:
type: string
required:
- limits
- namespace
- requests
- count
type: object
type: array
availableTFlops:
Expand Down
6 changes: 5 additions & 1 deletion charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -546,7 +546,8 @@ spec:
description: Default requests and limitsOverRequests are
same, indicates normal on-demand serverless GPU usage,
in hands-on lab low QoS case, limitsOverRequests should
be cheaper, for example Low QoS, ratio should be 0.5
be lower, so that user can get burstable GPU resources
with very low cost
type: string
qos:
enum:
Expand Down Expand Up @@ -704,6 +705,9 @@ spec:
readyNodes:
format: int32
type: integer
runningAppsCnt:
format: int32
type: integer
savedCostsPerMonth:
type: string
totalGPUs:
Expand Down
15 changes: 15 additions & 0 deletions charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,21 @@ spec:
- Destroying
- Migrating
type: string
runningApps:
items:
properties:
count:
description: Worker count
type: integer
name:
description: Workload name namespace
type: string
namespace:
type: string
required:
- count
type: object
type: array
uuid:
type: string
required:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -650,8 +650,8 @@ spec:
description: Default requests and limitsOverRequests
are same, indicates normal on-demand serverless
GPU usage, in hands-on lab low QoS case, limitsOverRequests
should be cheaper, for example Low QoS, ratio
should be 0.5
should be lower, so that user can get burstable
GPU resources with very low cost
type: string
qos:
enum:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ webhooks:
resources:
- pods
sideEffects: None
timeoutSeconds: 30
objectSelector:
matchExpressions:
- key: tensor-fusion.ai/enabled
Expand Down
Loading