From 492f074127f72b330f98d9af193e0ce5cc3ca2b2 Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Tue, 3 Dec 2024 05:43:57 +0000 Subject: [PATCH 01/22] chore: kubebuilder init --- .devcontainer/devcontainer.json | 25 ++ .devcontainer/post-install.sh | 23 ++ .dockerignore | 3 + .github/workflows/lint.yml | 23 ++ .github/workflows/test-e2e.yml | 35 ++ .github/workflows/test.yml | 23 ++ .gitignore | 27 ++ .golangci.yml | 47 +++ Dockerfile | 33 ++ Makefile | 212 ++++++++++++ PROJECT | 10 + cmd/main.go | 157 +++++++++ config/default/kustomization.yaml | 177 ++++++++++ config/default/manager_metrics_patch.yaml | 4 + config/default/metrics_service.yaml | 17 + config/manager/kustomization.yaml | 2 + config/manager/manager.yaml | 95 ++++++ .../network-policy/allow-metrics-traffic.yaml | 26 ++ config/network-policy/kustomization.yaml | 2 + config/prometheus/kustomization.yaml | 2 + config/prometheus/monitor.yaml | 30 ++ config/rbac/kustomization.yaml | 20 ++ config/rbac/leader_election_role.yaml | 40 +++ config/rbac/leader_election_role_binding.yaml | 15 + config/rbac/metrics_auth_role.yaml | 17 + config/rbac/metrics_auth_role_binding.yaml | 12 + config/rbac/metrics_reader_role.yaml | 9 + config/rbac/role.yaml | 11 + config/rbac/role_binding.yaml | 15 + config/rbac/service_account.yaml | 8 + go.mod | 98 ++++++ go.sum | 251 ++++++++++++++ hack/boilerplate.go.txt | 15 + test/e2e/e2e_suite_test.go | 120 +++++++ test/e2e/e2e_test.go | 307 ++++++++++++++++++ test/utils/utils.go | 251 ++++++++++++++ 36 files changed, 2162 insertions(+) create mode 100644 .devcontainer/devcontainer.json create mode 100644 .devcontainer/post-install.sh create mode 100644 .dockerignore create mode 100644 .github/workflows/lint.yml create mode 100644 .github/workflows/test-e2e.yml create mode 100644 .github/workflows/test.yml create mode 100644 .gitignore create mode 100644 .golangci.yml create mode 100644 Dockerfile create mode 100644 Makefile create mode 100644 PROJECT create mode 
100644 cmd/main.go create mode 100644 config/default/kustomization.yaml create mode 100644 config/default/manager_metrics_patch.yaml create mode 100644 config/default/metrics_service.yaml create mode 100644 config/manager/kustomization.yaml create mode 100644 config/manager/manager.yaml create mode 100644 config/network-policy/allow-metrics-traffic.yaml create mode 100644 config/network-policy/kustomization.yaml create mode 100644 config/prometheus/kustomization.yaml create mode 100644 config/prometheus/monitor.yaml create mode 100644 config/rbac/kustomization.yaml create mode 100644 config/rbac/leader_election_role.yaml create mode 100644 config/rbac/leader_election_role_binding.yaml create mode 100644 config/rbac/metrics_auth_role.yaml create mode 100644 config/rbac/metrics_auth_role_binding.yaml create mode 100644 config/rbac/metrics_reader_role.yaml create mode 100644 config/rbac/role.yaml create mode 100644 config/rbac/role_binding.yaml create mode 100644 config/rbac/service_account.yaml create mode 100644 go.mod create mode 100644 go.sum create mode 100644 hack/boilerplate.go.txt create mode 100644 test/e2e/e2e_suite_test.go create mode 100644 test/e2e/e2e_test.go create mode 100644 test/utils/utils.go diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..e2cdc09 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,25 @@ +{ + "name": "Kubebuilder DevContainer", + "image": "golang:1.22", + "features": { + "ghcr.io/devcontainers/features/docker-in-docker:2": {}, + "ghcr.io/devcontainers/features/git:1": {} + }, + + "runArgs": ["--network=host"], + + "customizations": { + "vscode": { + "settings": { + "terminal.integrated.shell.linux": "/bin/bash" + }, + "extensions": [ + "ms-kubernetes-tools.vscode-kubernetes-tools", + "ms-azuretools.vscode-docker" + ] + } + }, + + "onCreateCommand": "bash .devcontainer/post-install.sh" +} + diff --git a/.devcontainer/post-install.sh 
b/.devcontainer/post-install.sh new file mode 100644 index 0000000..265c43e --- /dev/null +++ b/.devcontainer/post-install.sh @@ -0,0 +1,23 @@ +#!/bin/bash +set -x + +curl -Lo ./kind https://kind.sigs.k8s.io/dl/latest/kind-linux-amd64 +chmod +x ./kind +mv ./kind /usr/local/bin/kind + +curl -L -o kubebuilder https://go.kubebuilder.io/dl/latest/linux/amd64 +chmod +x kubebuilder +mv kubebuilder /usr/local/bin/ + +KUBECTL_VERSION=$(curl -L -s https://dl.k8s.io/release/stable.txt) +curl -LO "https://dl.k8s.io/release/$KUBECTL_VERSION/bin/linux/amd64/kubectl" +chmod +x kubectl +mv kubectl /usr/local/bin/kubectl + +docker network create -d=bridge --subnet=172.19.0.0/24 kind + +kind version +kubebuilder version +docker --version +go version +kubectl version --client diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..a3aab7a --- /dev/null +++ b/.dockerignore @@ -0,0 +1,3 @@ +# More info: https://docs.docker.com/engine/reference/builder/#dockerignore-file +# Ignore build and test binaries. 
+bin/ diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..f40d365 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,23 @@ +name: Lint + +on: + push: + pull_request: + +jobs: + lint: + name: Run on Ubuntu + runs-on: ubuntu-latest + steps: + - name: Clone the code + uses: actions/checkout@v4 + + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: '~1.22' + + - name: Run linter + uses: golangci/golangci-lint-action@v6 + with: + version: v1.61 diff --git a/.github/workflows/test-e2e.yml b/.github/workflows/test-e2e.yml new file mode 100644 index 0000000..8780644 --- /dev/null +++ b/.github/workflows/test-e2e.yml @@ -0,0 +1,35 @@ +name: E2E Tests + +on: + push: + pull_request: + +jobs: + test-e2e: + name: Run on Ubuntu + runs-on: ubuntu-latest + steps: + - name: Clone the code + uses: actions/checkout@v4 + + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: '~1.22' + + - name: Install the latest version of kind + run: | + curl -Lo ./kind https://kind.sigs.k8s.io/dl/latest/kind-linux-amd64 + chmod +x ./kind + sudo mv ./kind /usr/local/bin/kind + + - name: Verify kind installation + run: kind version + + - name: Create kind cluster + run: kind create cluster + + - name: Running Test e2e + run: | + go mod tidy + make test-e2e diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..7baf657 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,23 @@ +name: Tests + +on: + push: + pull_request: + +jobs: + test: + name: Run on Ubuntu + runs-on: ubuntu-latest + steps: + - name: Clone the code + uses: actions/checkout@v4 + + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: '~1.22' + + - name: Running Tests + run: | + go mod tidy + make test diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ada68ff --- /dev/null +++ b/.gitignore @@ -0,0 +1,27 @@ +# Binaries for programs and plugins +*.exe +*.exe~ 
+*.dll +*.so +*.dylib +bin/* +Dockerfile.cross + +# Test binary, built with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out + +# Go workspace file +go.work + +# Kubernetes Generated files - skip generated files, except for vendored files +!vendor/**/zz_generated.* + +# editor and IDE paraphernalia +.idea +.vscode +*.swp +*.swo +*~ diff --git a/.golangci.yml b/.golangci.yml new file mode 100644 index 0000000..6b29746 --- /dev/null +++ b/.golangci.yml @@ -0,0 +1,47 @@ +run: + timeout: 5m + allow-parallel-runners: true + +issues: + # don't skip warning about doc comments + # don't exclude the default set of lint + exclude-use-default: false + # restore some of the defaults + # (fill in the rest as needed) + exclude-rules: + - path: "api/*" + linters: + - lll + - path: "internal/*" + linters: + - dupl + - lll +linters: + disable-all: true + enable: + - dupl + - errcheck + - copyloopvar + - ginkgolinter + - goconst + - gocyclo + - gofmt + - goimports + - gosimple + - govet + - ineffassign + - lll + - misspell + - nakedret + - prealloc + - revive + - staticcheck + - typecheck + - unconvert + - unparam + - unused + +linters-settings: + revive: + rules: + - name: comment-spacings diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..4ba18b6 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,33 @@ +# Build the manager binary +FROM golang:1.22 AS builder +ARG TARGETOS +ARG TARGETARCH + +WORKDIR /workspace +# Copy the Go Modules manifests +COPY go.mod go.mod +COPY go.sum go.sum +# cache deps before building and copying source so that we don't need to re-download as much +# and so that source changes don't invalidate our downloaded layer +RUN go mod download + +# Copy the go source +COPY cmd/main.go cmd/main.go +COPY api/ api/ +COPY internal/ internal/ + +# Build +# the GOARCH has not a default value to allow the binary be built according to the host where the command +# was called. 
For example, if we call make docker-build in a local env which has the Apple Silicon M1 OS +# the docker BUILDPLATFORM arg will be linux/arm64 when for Apple x86 it will be linux/amd64. Therefore, +# by leaving it empty we can ensure that the container and binary shipped on it will have the same platform. +RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o manager cmd/main.go + +# Use distroless as minimal base image to package the manager binary +# Refer to https://github.com/GoogleContainerTools/distroless for more details +FROM gcr.io/distroless/static:nonroot +WORKDIR / +COPY --from=builder /workspace/manager . +USER 65532:65532 + +ENTRYPOINT ["/manager"] diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..a95fe02 --- /dev/null +++ b/Makefile @@ -0,0 +1,212 @@ +# Image URL to use all building/pushing image targets +IMG ?= controller:latest +# ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary. +ENVTEST_K8S_VERSION = 1.31.0 + +# Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set) +ifeq (,$(shell go env GOBIN)) +GOBIN=$(shell go env GOPATH)/bin +else +GOBIN=$(shell go env GOBIN) +endif + +# CONTAINER_TOOL defines the container tool to be used for building images. +# Be aware that the target commands are only tested with Docker which is +# scaffolded by default. However, you might want to replace it to use other +# tools. (i.e. podman) +CONTAINER_TOOL ?= docker + +# Setting SHELL to bash allows bash commands to be executed by recipes. +# Options are set to exit when a recipe line exits non-zero or a piped command fails. +SHELL = /usr/bin/env bash -o pipefail +.SHELLFLAGS = -ec + +.PHONY: all +all: build + +##@ General + +# The help target prints out all targets with their descriptions organized +# beneath their categories. The categories are represented by '##@' and the +# target descriptions by '##'. 
The awk command is responsible for reading the +# entire set of makefiles included in this invocation, looking for lines of the +# file as xyz: ## something, and then pretty-format the target and help. Then, +# if there's a line with ##@ something, that gets pretty-printed as a category. +# More info on the usage of ANSI control characters for terminal formatting: +# https://en.wikipedia.org/wiki/ANSI_escape_code#SGR_parameters +# More info on the awk command: +# http://linuxcommand.org/lc3_adv_awk.php + +.PHONY: help +help: ## Display this help. + @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) + +##@ Development + +.PHONY: manifests +manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects. + $(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases + +.PHONY: generate +generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations. + $(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..." + +.PHONY: fmt +fmt: ## Run go fmt against code. + go fmt ./... + +.PHONY: vet +vet: ## Run go vet against code. + go vet ./... + +.PHONY: test +test: manifests generate fmt vet envtest ## Run tests. + KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -coverprofile cover.out + +# TODO(user): To use a different vendor for e2e tests, modify the setup under 'test/e2e'. +# The default setup assumes Kind is pre-installed and builds/loads the Manager Docker image locally. 
+# Prometheus and CertManager are installed by default; skip with: +# - PROMETHEUS_INSTALL_SKIP=true +# - CERT_MANAGER_INSTALL_SKIP=true +.PHONY: test-e2e +test-e2e: manifests generate fmt vet ## Run the e2e tests. Expected an isolated environment using Kind. + @command -v kind >/dev/null 2>&1 || { \ + echo "Kind is not installed. Please install Kind manually."; \ + exit 1; \ + } + @kind get clusters | grep -q 'kind' || { \ + echo "No Kind cluster is running. Please start a Kind cluster before running the e2e tests."; \ + exit 1; \ + } + go test ./test/e2e/ -v -ginkgo.v + +.PHONY: lint +lint: golangci-lint ## Run golangci-lint linter + $(GOLANGCI_LINT) run + +.PHONY: lint-fix +lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes + $(GOLANGCI_LINT) run --fix + +##@ Build + +.PHONY: build +build: manifests generate fmt vet ## Build manager binary. + go build -o bin/manager cmd/main.go + +.PHONY: run +run: manifests generate fmt vet ## Run a controller from your host. + go run ./cmd/main.go + +# If you wish to build the manager image targeting other platforms you can use the --platform flag. +# (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it. +# More info: https://docs.docker.com/develop/develop-images/build_enhancements/ +.PHONY: docker-build +docker-build: ## Build docker image with the manager. + $(CONTAINER_TOOL) build -t ${IMG} . + +.PHONY: docker-push +docker-push: ## Push docker image with the manager. + $(CONTAINER_TOOL) push ${IMG} + +# PLATFORMS defines the target platforms for the manager image be built to provide support to multiple +# architectures. (i.e. make docker-buildx IMG=myregistry/mypoperator:0.0.1). To use this option you need to: +# - be able to use docker buildx. More info: https://docs.docker.com/build/buildx/ +# - have enabled BuildKit. More info: https://docs.docker.com/develop/develop-images/build_enhancements/ +# - be able to push the image to your registry (i.e. 
if you do not set a valid value via IMG=> then the export will fail) +# To adequately provide solutions that are compatible with multiple platforms, you should consider using this option. +PLATFORMS ?= linux/arm64,linux/amd64,linux/s390x,linux/ppc64le +.PHONY: docker-buildx +docker-buildx: ## Build and push docker image for the manager for cross-platform support + # copy existing Dockerfile and insert --platform=${BUILDPLATFORM} into Dockerfile.cross, and preserve the original Dockerfile + sed -e '1 s/\(^FROM\)/FROM --platform=\$$\{BUILDPLATFORM\}/; t' -e ' 1,// s//FROM --platform=\$$\{BUILDPLATFORM\}/' Dockerfile > Dockerfile.cross + - $(CONTAINER_TOOL) buildx create --name tensor-fusion-operator-builder + $(CONTAINER_TOOL) buildx use tensor-fusion-operator-builder + - $(CONTAINER_TOOL) buildx build --push --platform=$(PLATFORMS) --tag ${IMG} -f Dockerfile.cross . + - $(CONTAINER_TOOL) buildx rm tensor-fusion-operator-builder + rm Dockerfile.cross + +.PHONY: build-installer +build-installer: manifests generate kustomize ## Generate a consolidated YAML with CRDs and deployment. + mkdir -p dist + cd config/manager && $(KUSTOMIZE) edit set image controller=${IMG} + $(KUSTOMIZE) build config/default > dist/install.yaml + +##@ Deployment + +ifndef ignore-not-found + ignore-not-found = false +endif + +.PHONY: install +install: manifests kustomize ## Install CRDs into the K8s cluster specified in ~/.kube/config. + $(KUSTOMIZE) build config/crd | $(KUBECTL) apply -f - + +.PHONY: uninstall +uninstall: manifests kustomize ## Uninstall CRDs from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion. + $(KUSTOMIZE) build config/crd | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - + +.PHONY: deploy +deploy: manifests kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config. 
+ cd config/manager && $(KUSTOMIZE) edit set image controller=${IMG} + $(KUSTOMIZE) build config/default | $(KUBECTL) apply -f - + +.PHONY: undeploy +undeploy: kustomize ## Undeploy controller from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion. + $(KUSTOMIZE) build config/default | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - + +##@ Dependencies + +## Location to install dependencies to +LOCALBIN ?= $(shell pwd)/bin +$(LOCALBIN): + mkdir -p $(LOCALBIN) + +## Tool Binaries +KUBECTL ?= kubectl +KUSTOMIZE ?= $(LOCALBIN)/kustomize +CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen +ENVTEST ?= $(LOCALBIN)/setup-envtest +GOLANGCI_LINT = $(LOCALBIN)/golangci-lint + +## Tool Versions +KUSTOMIZE_VERSION ?= v5.5.0 +CONTROLLER_TOOLS_VERSION ?= v0.16.4 +ENVTEST_VERSION ?= release-0.19 +GOLANGCI_LINT_VERSION ?= v1.61.0 + +.PHONY: kustomize +kustomize: $(KUSTOMIZE) ## Download kustomize locally if necessary. +$(KUSTOMIZE): $(LOCALBIN) + $(call go-install-tool,$(KUSTOMIZE),sigs.k8s.io/kustomize/kustomize/v5,$(KUSTOMIZE_VERSION)) + +.PHONY: controller-gen +controller-gen: $(CONTROLLER_GEN) ## Download controller-gen locally if necessary. +$(CONTROLLER_GEN): $(LOCALBIN) + $(call go-install-tool,$(CONTROLLER_GEN),sigs.k8s.io/controller-tools/cmd/controller-gen,$(CONTROLLER_TOOLS_VERSION)) + +.PHONY: envtest +envtest: $(ENVTEST) ## Download setup-envtest locally if necessary. +$(ENVTEST): $(LOCALBIN) + $(call go-install-tool,$(ENVTEST),sigs.k8s.io/controller-runtime/tools/setup-envtest,$(ENVTEST_VERSION)) + +.PHONY: golangci-lint +golangci-lint: $(GOLANGCI_LINT) ## Download golangci-lint locally if necessary. 
+$(GOLANGCI_LINT): $(LOCALBIN) + $(call go-install-tool,$(GOLANGCI_LINT),github.com/golangci/golangci-lint/cmd/golangci-lint,$(GOLANGCI_LINT_VERSION)) + +# go-install-tool will 'go install' any package with custom target and name of binary, if it doesn't exist +# $1 - target path with name of binary +# $2 - package url which can be installed +# $3 - specific version of package +define go-install-tool +@[ -f "$(1)-$(3)" ] || { \ +set -e; \ +package=$(2)@$(3) ;\ +echo "Downloading $${package}" ;\ +rm -f $(1) || true ;\ +GOBIN=$(LOCALBIN) go install $${package} ;\ +mv $(1) $(1)-$(3) ;\ +} ;\ +ln -sf $(1)-$(3) $(1) +endef diff --git a/PROJECT b/PROJECT new file mode 100644 index 0000000..cb60042 --- /dev/null +++ b/PROJECT @@ -0,0 +1,10 @@ +# Code generated by tool. DO NOT EDIT. +# This file is used to track the info used to scaffold your project +# and allow the plugins properly work. +# More info: https://book.kubebuilder.io/reference/project-config.html +domain: tensor-fusion.ai +layout: +- go.kubebuilder.io/v4 +projectName: tensor-fusion-operator +repo: github.com/NexusGPU/tensor-fusion-operator +version: "3" diff --git a/cmd/main.go b/cmd/main.go new file mode 100644 index 0000000..8992f96 --- /dev/null +++ b/cmd/main.go @@ -0,0 +1,157 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "crypto/tls" + "flag" + "os" + + // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) 
+ // to ensure that exec-entrypoint and run can make use of them. + _ "k8s.io/client-go/plugin/pkg/client/auth" + + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/healthz" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + "sigs.k8s.io/controller-runtime/pkg/metrics/filters" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + "sigs.k8s.io/controller-runtime/pkg/webhook" + // +kubebuilder:scaffold:imports +) + +var ( + scheme = runtime.NewScheme() + setupLog = ctrl.Log.WithName("setup") +) + +func init() { + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + + // +kubebuilder:scaffold:scheme +} + +func main() { + var metricsAddr string + var enableLeaderElection bool + var probeAddr string + var secureMetrics bool + var enableHTTP2 bool + var tlsOpts []func(*tls.Config) + flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+ + "Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.") + flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") + flag.BoolVar(&enableLeaderElection, "leader-elect", false, + "Enable leader election for controller manager. "+ + "Enabling this will ensure there is only one active controller manager.") + flag.BoolVar(&secureMetrics, "metrics-secure", true, + "If set, the metrics endpoint is served securely via HTTPS. 
Use --metrics-secure=false to use HTTP instead.") + flag.BoolVar(&enableHTTP2, "enable-http2", false, + "If set, HTTP/2 will be enabled for the metrics and webhook servers") + opts := zap.Options{ + Development: true, + } + opts.BindFlags(flag.CommandLine) + flag.Parse() + + ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + + // if the enable-http2 flag is false (the default), http/2 should be disabled + // due to its vulnerabilities. More specifically, disabling http/2 will + // prevent from being vulnerable to the HTTP/2 Stream Cancellation and + // Rapid Reset CVEs. For more information see: + // - https://github.com/advisories/GHSA-qppj-fm5r-hxr3 + // - https://github.com/advisories/GHSA-4374-p667-p6c8 + disableHTTP2 := func(c *tls.Config) { + setupLog.Info("disabling http/2") + c.NextProtos = []string{"http/1.1"} + } + + if !enableHTTP2 { + tlsOpts = append(tlsOpts, disableHTTP2) + } + + webhookServer := webhook.NewServer(webhook.Options{ + TLSOpts: tlsOpts, + }) + + // Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server. + // More info: + // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/metrics/server + // - https://book.kubebuilder.io/reference/metrics.html + metricsServerOptions := metricsserver.Options{ + BindAddress: metricsAddr, + SecureServing: secureMetrics, + TLSOpts: tlsOpts, + } + + if secureMetrics { + // FilterProvider is used to protect the metrics endpoint with authn/authz. + // These configurations ensure that only authorized users and service accounts + // can access the metrics endpoint. The RBAC are configured in 'config/rbac/kustomization.yaml'. 
More info: + // https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/metrics/filters#WithAuthenticationAndAuthorization + metricsServerOptions.FilterProvider = filters.WithAuthenticationAndAuthorization + + // TODO(user): If CertDir, CertName, and KeyName are not specified, controller-runtime will automatically + // generate self-signed certificates for the metrics server. While convenient for development and testing, + // this setup is not recommended for production. + } + + mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ + Scheme: scheme, + Metrics: metricsServerOptions, + WebhookServer: webhookServer, + HealthProbeBindAddress: probeAddr, + LeaderElection: enableLeaderElection, + LeaderElectionID: "85104305.tensor-fusion.ai", + // LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily + // when the Manager ends. This requires the binary to immediately end when the + // Manager is stopped, otherwise, this setting is unsafe. Setting this significantly + // speeds up voluntary leader transitions as the new leader doesn't have to wait + // LeaseDuration time first. + // + // In the default scaffold provided, the program ends immediately after + // the manager stops, so it would be fine to enable this option. However, + // if you are doing, or intend to do, any operation such as performing cleanups + // after the manager stops then its usage might be unsafe. 
+ // LeaderElectionReleaseOnCancel: true, + }) + if err != nil { + setupLog.Error(err, "unable to start manager") + os.Exit(1) + } + + // +kubebuilder:scaffold:builder + + if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { + setupLog.Error(err, "unable to set up health check") + os.Exit(1) + } + if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { + setupLog.Error(err, "unable to set up ready check") + os.Exit(1) + } + + setupLog.Info("starting manager") + if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { + setupLog.Error(err, "problem running manager") + os.Exit(1) + } +} diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml new file mode 100644 index 0000000..c27f571 --- /dev/null +++ b/config/default/kustomization.yaml @@ -0,0 +1,177 @@ +# Adds namespace to all resources. +namespace: tensor-fusion-operator-system + +# Value of this field is prepended to the +# names of all resources, e.g. a deployment named +# "wordpress" becomes "alices-wordpress". +# Note that it should also match with the prefix (text before '-') of the namespace +# field above. +namePrefix: tensor-fusion-operator- + +# Labels to add to all resources and selectors. +#labels: +#- includeSelectors: true +# pairs: +# someName: someValue + +resources: +#- ../crd +- ../rbac +- ../manager +# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in +# crd/kustomization.yaml +#- ../webhook +# [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required. +#- ../certmanager +# [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'. +#- ../prometheus +# [METRICS] Expose the controller manager metrics service. +- metrics_service.yaml +# [NETWORK POLICY] Protect the /metrics endpoint and Webhook Server with NetworkPolicy. 
+# Only Pod(s) running a namespace labeled with 'metrics: enabled' will be able to gather the metrics. +# Only CR(s) which requires webhooks and are applied on namespaces labeled with 'webhooks: enabled' will +# be able to communicate with the Webhook Server. +#- ../network-policy + +# Uncomment the patches line if you enable Metrics, and/or are using webhooks and cert-manager +patches: +# [METRICS] The following patch will enable the metrics endpoint using HTTPS and the port :8443. +# More info: https://book.kubebuilder.io/reference/metrics +- path: manager_metrics_patch.yaml + target: + kind: Deployment + +# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in +# crd/kustomization.yaml +#- path: manager_webhook_patch.yaml + +# [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER' prefix. +# Uncomment the following replacements to add the cert-manager CA injection annotations +#replacements: +# - source: # Uncomment the following block if you have any webhook +# kind: Service +# version: v1 +# name: webhook-service +# fieldPath: .metadata.name # Name of the service +# targets: +# - select: +# kind: Certificate +# group: cert-manager.io +# version: v1 +# fieldPaths: +# - .spec.dnsNames.0 +# - .spec.dnsNames.1 +# options: +# delimiter: '.' +# index: 0 +# create: true +# - source: +# kind: Service +# version: v1 +# name: webhook-service +# fieldPath: .metadata.namespace # Namespace of the service +# targets: +# - select: +# kind: Certificate +# group: cert-manager.io +# version: v1 +# fieldPaths: +# - .spec.dnsNames.0 +# - .spec.dnsNames.1 +# options: +# delimiter: '.' 
+# index: 1 +# create: true +# +# - source: # Uncomment the following block if you have a ValidatingWebhook (--programmatic-validation) +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: serving-cert # This name should match the one in certificate.yaml +# fieldPath: .metadata.namespace # Namespace of the certificate CR +# targets: +# - select: +# kind: ValidatingWebhookConfiguration +# fieldPaths: +# - .metadata.annotations.[cert-manager.io/inject-ca-from] +# options: +# delimiter: '/' +# index: 0 +# create: true +# - source: +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: serving-cert # This name should match the one in certificate.yaml +# fieldPath: .metadata.name +# targets: +# - select: +# kind: ValidatingWebhookConfiguration +# fieldPaths: +# - .metadata.annotations.[cert-manager.io/inject-ca-from] +# options: +# delimiter: '/' +# index: 1 +# create: true +# +# - source: # Uncomment the following block if you have a DefaultingWebhook (--defaulting ) +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: serving-cert # This name should match the one in certificate.yaml +# fieldPath: .metadata.namespace # Namespace of the certificate CR +# targets: +# - select: +# kind: MutatingWebhookConfiguration +# fieldPaths: +# - .metadata.annotations.[cert-manager.io/inject-ca-from] +# options: +# delimiter: '/' +# index: 0 +# create: true +# - source: +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: serving-cert # This name should match the one in certificate.yaml +# fieldPath: .metadata.name +# targets: +# - select: +# kind: MutatingWebhookConfiguration +# fieldPaths: +# - .metadata.annotations.[cert-manager.io/inject-ca-from] +# options: +# delimiter: '/' +# index: 1 +# create: true +# +# - source: # Uncomment the following block if you have a ConversionWebhook (--conversion) +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: serving-cert # This name should match the one 
in certificate.yaml +# fieldPath: .metadata.namespace # Namespace of the certificate CR +# targets: +# - select: +# kind: CustomResourceDefinition +# fieldPaths: +# - .metadata.annotations.[cert-manager.io/inject-ca-from] +# options: +# delimiter: '/' +# index: 0 +# create: true +# - source: +# kind: Certificate +# group: cert-manager.io +# version: v1 +# name: serving-cert # This name should match the one in certificate.yaml +# fieldPath: .metadata.name +# targets: +# - select: +# kind: CustomResourceDefinition +# fieldPaths: +# - .metadata.annotations.[cert-manager.io/inject-ca-from] +# options: +# delimiter: '/' +# index: 1 +# create: true diff --git a/config/default/manager_metrics_patch.yaml b/config/default/manager_metrics_patch.yaml new file mode 100644 index 0000000..2aaef65 --- /dev/null +++ b/config/default/manager_metrics_patch.yaml @@ -0,0 +1,4 @@ +# This patch adds the args to allow exposing the metrics endpoint using HTTPS +- op: add + path: /spec/template/spec/containers/0/args/0 + value: --metrics-bind-address=:8443 diff --git a/config/default/metrics_service.yaml b/config/default/metrics_service.yaml new file mode 100644 index 0000000..82a1a1b --- /dev/null +++ b/config/default/metrics_service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + control-plane: controller-manager + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize + name: controller-manager-metrics-service + namespace: system +spec: + ports: + - name: https + port: 8443 + protocol: TCP + targetPort: 8443 + selector: + control-plane: controller-manager diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml new file mode 100644 index 0000000..5c5f0b8 --- /dev/null +++ b/config/manager/kustomization.yaml @@ -0,0 +1,2 @@ +resources: +- manager.yaml diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml new file mode 100644 index 0000000..1286879 --- /dev/null +++ 
b/config/manager/manager.yaml @@ -0,0 +1,95 @@ +apiVersion: v1 +kind: Namespace +metadata: + labels: + control-plane: controller-manager + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize + name: system +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: controller-manager + namespace: system + labels: + control-plane: controller-manager + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize +spec: + selector: + matchLabels: + control-plane: controller-manager + replicas: 1 + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + labels: + control-plane: controller-manager + spec: + # TODO(user): Uncomment the following code to configure the nodeAffinity expression + # according to the platforms which are supported by your solution. + # It is considered best practice to support multiple architectures. You can + # build your manager image using the makefile target docker-buildx. + # affinity: + # nodeAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # nodeSelectorTerms: + # - matchExpressions: + # - key: kubernetes.io/arch + # operator: In + # values: + # - amd64 + # - arm64 + # - ppc64le + # - s390x + # - key: kubernetes.io/os + # operator: In + # values: + # - linux + securityContext: + runAsNonRoot: true + # TODO(user): For common cases that do not require escalating privileges + # it is recommended to ensure that all your Pods/Containers are restrictive. + # More info: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted + # Please uncomment the following code if your project does NOT have to work on old Kubernetes + # versions < 1.19 or on vendors versions which do NOT support this field by default (i.e. Openshift < 4.11 ). 
+ # seccompProfile: + # type: RuntimeDefault + containers: + - command: + - /manager + args: + - --leader-elect + - --health-probe-bind-address=:8081 + image: controller:latest + name: manager + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - "ALL" + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + # TODO(user): Configure the resources accordingly based on the project requirements. + # More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + resources: + limits: + cpu: 500m + memory: 128Mi + requests: + cpu: 10m + memory: 64Mi + serviceAccountName: controller-manager + terminationGracePeriodSeconds: 10 diff --git a/config/network-policy/allow-metrics-traffic.yaml b/config/network-policy/allow-metrics-traffic.yaml new file mode 100644 index 0000000..e6e9d7f --- /dev/null +++ b/config/network-policy/allow-metrics-traffic.yaml @@ -0,0 +1,26 @@ +# This NetworkPolicy allows ingress traffic +# from Pods running in namespaces labeled with 'metrics: enabled'. Only Pods in those +# namespaces are able to gather data from the metrics endpoint.
+apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + labels: + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize + name: allow-metrics-traffic + namespace: system +spec: + podSelector: + matchLabels: + control-plane: controller-manager + policyTypes: + - Ingress + ingress: + # This allows ingress traffic from any namespace with the label metrics: enabled + - from: + - namespaceSelector: + matchLabels: + metrics: enabled # Only from namespaces with this label + ports: + - port: 8443 + protocol: TCP diff --git a/config/network-policy/kustomization.yaml b/config/network-policy/kustomization.yaml new file mode 100644 index 0000000..ec0fb5e --- /dev/null +++ b/config/network-policy/kustomization.yaml @@ -0,0 +1,2 @@ +resources: +- allow-metrics-traffic.yaml diff --git a/config/prometheus/kustomization.yaml b/config/prometheus/kustomization.yaml new file mode 100644 index 0000000..ed13716 --- /dev/null +++ b/config/prometheus/kustomization.yaml @@ -0,0 +1,2 @@ +resources: +- monitor.yaml diff --git a/config/prometheus/monitor.yaml b/config/prometheus/monitor.yaml new file mode 100644 index 0000000..f732325 --- /dev/null +++ b/config/prometheus/monitor.yaml @@ -0,0 +1,30 @@ +# Prometheus Monitor Service (Metrics) +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + control-plane: controller-manager + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize + name: controller-manager-metrics-monitor + namespace: system +spec: + endpoints: + - path: /metrics + port: https # Ensure this is the name of the port that exposes HTTPS metrics + scheme: https + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + tlsConfig: + # TODO(user): The option insecureSkipVerify: true is not recommended for production since it disables + # certificate verification. 
This poses a significant security risk by making the system vulnerable to + # man-in-the-middle attacks, where an attacker could intercept and manipulate the communication between + # Prometheus and the monitored services. This could lead to unauthorized access to sensitive metrics data, + # compromising the integrity and confidentiality of the information. + # Please use the following options for secure configurations: + # caFile: /etc/metrics-certs/ca.crt + # certFile: /etc/metrics-certs/tls.crt + # keyFile: /etc/metrics-certs/tls.key + insecureSkipVerify: true + selector: + matchLabels: + control-plane: controller-manager diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml new file mode 100644 index 0000000..5619aa0 --- /dev/null +++ b/config/rbac/kustomization.yaml @@ -0,0 +1,20 @@ +resources: +# All RBAC will be applied under this service account in +# the deployment namespace. You may comment out this resource +# if your manager will use a service account that exists at +# runtime. Be sure to update RoleBinding and ClusterRoleBinding +# subjects if changing service account names. +- service_account.yaml +- role.yaml +- role_binding.yaml +- leader_election_role.yaml +- leader_election_role_binding.yaml +# The following RBAC configurations are used to protect +# the metrics endpoint with authn/authz. These configurations +# ensure that only authorized users and service accounts +# can access the metrics endpoint. Comment the following +# permissions if you want to disable this protection. +# More info: https://book.kubebuilder.io/reference/metrics.html +- metrics_auth_role.yaml +- metrics_auth_role_binding.yaml +- metrics_reader_role.yaml diff --git a/config/rbac/leader_election_role.yaml b/config/rbac/leader_election_role.yaml new file mode 100644 index 0000000..d77f50e --- /dev/null +++ b/config/rbac/leader_election_role.yaml @@ -0,0 +1,40 @@ +# permissions to do leader election. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + labels: + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize + name: leader-election-role +rules: +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch diff --git a/config/rbac/leader_election_role_binding.yaml b/config/rbac/leader_election_role_binding.yaml new file mode 100644 index 0000000..9e4dd73 --- /dev/null +++ b/config/rbac/leader_election_role_binding.yaml @@ -0,0 +1,15 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + labels: + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize + name: leader-election-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: leader-election-role +subjects: +- kind: ServiceAccount + name: controller-manager + namespace: system diff --git a/config/rbac/metrics_auth_role.yaml b/config/rbac/metrics_auth_role.yaml new file mode 100644 index 0000000..32d2e4e --- /dev/null +++ b/config/rbac/metrics_auth_role.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: metrics-auth-role +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create diff --git a/config/rbac/metrics_auth_role_binding.yaml b/config/rbac/metrics_auth_role_binding.yaml new file mode 100644 index 0000000..e775d67 --- /dev/null +++ b/config/rbac/metrics_auth_role_binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: metrics-auth-rolebinding 
+roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: metrics-auth-role +subjects: +- kind: ServiceAccount + name: controller-manager + namespace: system diff --git a/config/rbac/metrics_reader_role.yaml b/config/rbac/metrics_reader_role.yaml new file mode 100644 index 0000000..51a75db --- /dev/null +++ b/config/rbac/metrics_reader_role.yaml @@ -0,0 +1,9 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: metrics-reader +rules: +- nonResourceURLs: + - "/metrics" + verbs: + - get diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml new file mode 100644 index 0000000..7454ff6 --- /dev/null +++ b/config/rbac/role.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize + name: manager-role +rules: +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] diff --git a/config/rbac/role_binding.yaml b/config/rbac/role_binding.yaml new file mode 100644 index 0000000..6087d7e --- /dev/null +++ b/config/rbac/role_binding.yaml @@ -0,0 +1,15 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize + name: manager-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: manager-role +subjects: +- kind: ServiceAccount + name: controller-manager + namespace: system diff --git a/config/rbac/service_account.yaml b/config/rbac/service_account.yaml new file mode 100644 index 0000000..20beb76 --- /dev/null +++ b/config/rbac/service_account.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize + name: controller-manager + namespace: system diff --git a/go.mod b/go.mod new file mode 100644 index 
0000000..c5f3936 --- /dev/null +++ b/go.mod @@ -0,0 +1,98 @@ +module github.com/NexusGPU/tensor-fusion-operator + +go 1.22.0 + +require ( + github.com/onsi/ginkgo/v2 v2.19.0 + github.com/onsi/gomega v1.33.1 + k8s.io/apimachinery v0.31.0 + k8s.io/client-go v0.31.0 + sigs.k8s.io/controller-runtime v0.19.1 +) + +require ( + github.com/antlr4-go/antlr/v4 v4.13.0 // indirect + github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a // indirect + github.com/beorn7/perks v1.0.1 // indirect + github.com/blang/semver/v4 v4.0.0 // indirect + github.com/cenkalti/backoff/v4 v4.3.0 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/emicklei/go-restful/v3 v3.11.0 // indirect + github.com/evanphx/json-patch/v5 v5.9.0 // indirect + github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/fsnotify/fsnotify v1.7.0 // indirect + github.com/fxamacker/cbor/v2 v2.7.0 // indirect + github.com/go-logr/logr v1.4.2 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/go-logr/zapr v1.3.0 // indirect + github.com/go-openapi/jsonpointer v0.19.6 // indirect + github.com/go-openapi/jsonreference v0.20.2 // indirect + github.com/go-openapi/swag v0.22.4 // indirect + github.com/go-task/slim-sprig/v3 v3.0.0 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect + github.com/golang/protobuf v1.5.4 // indirect + github.com/google/cel-go v0.20.1 // indirect + github.com/google/gnostic-models v0.6.8 // indirect + github.com/google/go-cmp v0.6.0 // indirect + github.com/google/gofuzz v1.2.0 // indirect + github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect + github.com/imdario/mergo v0.3.6 // indirect + github.com/inconshreveable/mousetrap v1.1.0 // indirect + 
github.com/josharian/intern v1.0.0 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/mailru/easyjson v0.7.7 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/pkg/errors v0.9.1 // indirect + github.com/prometheus/client_golang v1.19.1 // indirect + github.com/prometheus/client_model v0.6.1 // indirect + github.com/prometheus/common v0.55.0 // indirect + github.com/prometheus/procfs v0.15.1 // indirect + github.com/spf13/cobra v1.8.1 // indirect + github.com/spf13/pflag v1.0.5 // indirect + github.com/stoewer/go-strcase v1.2.0 // indirect + github.com/x448/float16 v0.8.4 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 // indirect + go.opentelemetry.io/otel v1.28.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0 // indirect + go.opentelemetry.io/otel/metric v1.28.0 // indirect + go.opentelemetry.io/otel/sdk v1.28.0 // indirect + go.opentelemetry.io/otel/trace v1.28.0 // indirect + go.opentelemetry.io/proto/otlp v1.3.1 // indirect + go.uber.org/multierr v1.11.0 // indirect + go.uber.org/zap v1.26.0 // indirect + golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc // indirect + golang.org/x/net v0.26.0 // indirect + golang.org/x/oauth2 v0.21.0 // indirect + golang.org/x/sync v0.7.0 // indirect + golang.org/x/sys v0.21.0 // indirect + golang.org/x/term v0.21.0 // indirect + golang.org/x/text v0.16.0 // indirect + golang.org/x/time v0.3.0 // indirect + golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect + gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157 // indirect + google.golang.org/genproto/googleapis/rpc 
v0.0.0-20240701130421-f6361c86f094 // indirect + google.golang.org/grpc v1.65.0 // indirect + google.golang.org/protobuf v1.34.2 // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/yaml.v2 v2.4.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/api v0.31.0 // indirect + k8s.io/apiextensions-apiserver v0.31.0 // indirect + k8s.io/apiserver v0.31.0 // indirect + k8s.io/component-base v0.31.0 // indirect + k8s.io/klog/v2 v2.130.1 // indirect + k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect + k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 // indirect + sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.30.3 // indirect + sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect + sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect + sigs.k8s.io/yaml v1.4.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..0958667 --- /dev/null +++ b/go.sum @@ -0,0 +1,251 @@ +github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= +github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= +github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a h1:idn718Q4B6AGu/h5Sxe66HYVdqdGu2l9Iebqhi/AEoA= +github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= +github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= +github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= +github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= 
+github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= +github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k= +github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ= +github.com/evanphx/json-patch/v5 v5.9.0 h1:kcBlZQbplgElYIlo/n1hJbls2z/1awpXxpRi0/FOJfg= +github.com/evanphx/json-patch/v5 v5.9.0/go.mod h1:VNkHZ/282BpEyt/tObQO8s5CMPmYYq14uClGH4abBuQ= +github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= +github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= +github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= +github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= +github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= +github.com/go-logr/logr v1.4.2/go.mod 
h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= +github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= +github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE= +github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= +github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= +github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= +github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= +github.com/go-openapi/swag v0.22.4 h1:QLMzNJnMGPRNDCbySlcj1x01tzU8/9LTTL9hZZZogBU= +github.com/go-openapi/swag v0.22.4/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/cel-go v0.20.1 h1:nDx9r8S3L4pE61eDdt8igGj8rf5kjYR3ILxWIpWNi84= +github.com/google/cel-go v0.20.1/go.mod h1:kWcIzTsPX0zmQ+H3TirHstLLf9ep5QTsZBN9u4dOYLg= +github.com/google/gnostic-models 
v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= +github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= +github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= +github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af h1:kmjWCqn2qkEml422C2Rrd27c3VGxi6a/6HNq8QmHRKM= +github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af/go.mod h1:K1liHPHnj73Fdn/EKuT8nrFqBihUSKXoLYU0BuatOYo= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 h1:bkypFPDjIYGfCYD5mRBvpqxfYX1YCS1PXdKYWi8FsN0= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0/go.mod h1:P+Lt/0by1T8bfcF3z737NnSbmxQAppXMRziHUxPOC8k= +github.com/imdario/mergo v0.3.6 h1:xTNEAn+kxVO7dTZGu0CegyqKZmoWFI0rF8UxjlB2d28= +github.com/imdario/mergo v0.3.6/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= +github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= +github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= 
+github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= +github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/onsi/ginkgo/v2 v2.19.0 h1:9Cnnf7UHo57Hy3k6/m5k3dRfGTMXGvxhHFvkDTCTpvA= +github.com/onsi/ginkgo/v2 v2.19.0/go.mod h1:rlwLi9PilAFJ8jCg9UE1QP6VBpd6/xj3SRC0d6TU0To= +github.com/onsi/gomega v1.33.1 h1:dsYjIxxSR755MDmKVsaFQTE22ChNBcuuTWgkUDSubOk= +github.com/onsi/gomega v1.33.1/go.mod 
h1:U4R44UsT+9eLIaYRB2a5qajjtQYn0hauxvRm16AVYg0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE= +github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho= +github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= +github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= +github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= +github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= +github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= +github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= +github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= +github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= +github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stoewer/go-strcase v1.2.0 h1:Z2iHWqGXH00XYgqDmNgQbIBxf3wrNq0F3feEy0ainaU= 
+github.com/stoewer/go-strcase v1.2.0/go.mod h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag1KpM8ahLw8= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= +github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 h1:4K4tsIXefpVJtvA/8srF4V4y0akAoPHkIslgAkjixJA= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0/go.mod h1:jjdQuTGVsXV4vSs+CJ2qYDeDPf9yIJV23qlIzBm73Vg= +go.opentelemetry.io/otel v1.28.0 h1:/SqNcYk+idO0CxKEUOtKQClMK/MimZihKYMruSMViUo= +go.opentelemetry.io/otel v1.28.0/go.mod h1:q68ijF8Fc8CnMHKyzqL6akLO46ePnjkgfIMIjUIX9z4= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 h1:3Q/xZUyC1BBkualc9ROb4G8qkH90LXEIICcs5zv1OYY= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0/go.mod h1:s75jGIWA9OfCMzF0xr+ZgfrB5FEbbV7UuYo32ahUiFI= 
+go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0 h1:qFffATk0X+HD+f1Z8lswGiOQYKHRlzfmdJm0wEaVrFA= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0/go.mod h1:MOiCmryaYtc+V0Ei+Tx9o5S1ZjA7kzLucuVuyzBZloQ= +go.opentelemetry.io/otel/metric v1.28.0 h1:f0HGvSl1KRAU1DLgLGFjrwVyismPlnuU6JD6bOeuA5Q= +go.opentelemetry.io/otel/metric v1.28.0/go.mod h1:Fb1eVBFZmLVTMb6PPohq3TO9IIhUisDsbJoL/+uQW4s= +go.opentelemetry.io/otel/sdk v1.28.0 h1:b9d7hIry8yZsgtbmM0DKyPWMMUMlK9NEKuIG4aBqWyE= +go.opentelemetry.io/otel/sdk v1.28.0/go.mod h1:oYj7ClPUA7Iw3m+r7GeEjz0qckQRJK2B8zjcZEfu7Pg= +go.opentelemetry.io/otel/trace v1.28.0 h1:GhQ9cUuQGmNDd5BTCP2dAvv75RdMxEfTmYejp+lkx9g= +go.opentelemetry.io/otel/trace v1.28.0/go.mod h1:jPyXzNPg6da9+38HEwElrQiHlVMTnVfM3/yv2OlIHaI= +go.opentelemetry.io/proto/otlp v1.3.1 h1:TrMUixzpM0yuc/znrFTP9MMRh8trP93mkCiDVeXrui0= +go.opentelemetry.io/proto/otlp v1.3.1/go.mod h1:0X1WI4de4ZsLrrJNLAQbFeLCm3T7yBkR0XqQ7niQU+8= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +go.uber.org/zap v1.26.0 h1:sI7k6L95XOKS281NhVKOFCUNIvv9e0w4BF8N3u+tCRo= +go.uber.org/zap v1.26.0/go.mod h1:dtElttAiwGvoJ/vj4IwHBS/gXsEu/pZ50mUIRWuG0so= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc h1:mCRnTeVUjcrhlRmO0VK8a6k6Rrf6TF9htwo2pJVSjIU= +golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc/go.mod h1:V1LtkGg67GoY2N1AnLN78QLrzxkLyJw7RJb1gzOOz9w= 
+golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= +golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= +golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs= +golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= +golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= +golang.org/x/term v0.21.0/go.mod 
h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= +golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= +golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= +golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= +gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= +google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157 h1:7whR9kGa5LUwFtpLm2ArCEejtnxlGeLbAyjFY8sGNFw= +google.golang.org/genproto/googleapis/api 
v0.0.0-20240528184218-531527333157/go.mod h1:99sLkeliLXfdj2J75X3Ho+rrVCaJze0uwN7zDDkjPVU= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094 h1:BwIjyKYGsK9dMCBOorzRri8MQwmi7mT9rGHsCEinZkA= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY= +google.golang.org/grpc v1.65.0 h1:bs/cUb4lp1G5iImFFd3u5ixQzweKizoZJAwBNLR42lc= +google.golang.org/grpc v1.65.0/go.mod h1:WgYC2ypjlB0EiQi6wdKixMqukr6lBc0Vo+oOgjrM5ZQ= +google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= +google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= +gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +k8s.io/api v0.31.0 h1:b9LiSjR2ym/SzTOlfMHm1tr7/21aD7fSkqgD/CVJBCo= +k8s.io/api 
v0.31.0/go.mod h1:0YiFF+JfFxMM6+1hQei8FY8M7s1Mth+z/q7eF1aJkTE= +k8s.io/apiextensions-apiserver v0.31.0 h1:fZgCVhGwsclj3qCw1buVXCV6khjRzKC5eCFt24kyLSk= +k8s.io/apiextensions-apiserver v0.31.0/go.mod h1:b9aMDEYaEe5sdK+1T0KU78ApR/5ZVp4i56VacZYEHxk= +k8s.io/apimachinery v0.31.0 h1:m9jOiSr3FoSSL5WO9bjm1n6B9KROYYgNZOb4tyZ1lBc= +k8s.io/apimachinery v0.31.0/go.mod h1:rsPdaZJfTfLsNJSQzNHQvYoTmxhoOEofxtOsF3rtsMo= +k8s.io/apiserver v0.31.0 h1:p+2dgJjy+bk+B1Csz+mc2wl5gHwvNkC9QJV+w55LVrY= +k8s.io/apiserver v0.31.0/go.mod h1:KI9ox5Yu902iBnnyMmy7ajonhKnkeZYJhTZ/YI+WEMk= +k8s.io/client-go v0.31.0 h1:QqEJzNjbN2Yv1H79SsS+SWnXkBgVu4Pj3CJQgbx0gI8= +k8s.io/client-go v0.31.0/go.mod h1:Y9wvC76g4fLjmU0BA+rV+h2cncoadjvjjkkIGoTLcGU= +k8s.io/component-base v0.31.0 h1:/KIzGM5EvPNQcYgwq5NwoQBaOlVFrghoVGr8lG6vNRs= +k8s.io/component-base v0.31.0/go.mod h1:TYVuzI1QmN4L5ItVdMSXKvH7/DtvIuas5/mm8YT3rTo= +k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= +k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= +k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 h1:BZqlfIlq5YbRMFko6/PM7FjZpUb45WallggurYhKGag= +k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340/go.mod h1:yD4MZYeKMBwQKVht279WycxKyM84kkAx2DPrTXaeb98= +k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 h1:pUdcCO1Lk/tbT5ztQWOBi5HBgbBP1J8+AsQnQCKsi8A= +k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.30.3 h1:2770sDpzrjjsAtVhSeUFseziht227YAWYHLGNM8QPwY= +sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.30.3/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= +sigs.k8s.io/controller-runtime v0.19.1 h1:Son+Q40+Be3QWb+niBXAg2vFiYWolDjjRfO8hn/cxOk= +sigs.k8s.io/controller-runtime v0.19.1/go.mod h1:iRmWllt8IlaLjvTTDLhRBXIEtkCK6hwVBJJsYS9Ajf4= +sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= +sigs.k8s.io/json 
v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= +sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= +sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= +sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= +sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= diff --git a/hack/boilerplate.go.txt b/hack/boilerplate.go.txt new file mode 100644 index 0000000..ff72ff2 --- /dev/null +++ b/hack/boilerplate.go.txt @@ -0,0 +1,15 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ \ No newline at end of file diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go new file mode 100644 index 0000000..0b72e40 --- /dev/null +++ b/test/e2e/e2e_suite_test.go @@ -0,0 +1,120 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "fmt" + "os" + "os/exec" + "testing" + + . 
"github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/NexusGPU/tensor-fusion-operator/test/utils" +) + +var ( + // Optional Environment Variables: + // - PROMETHEUS_INSTALL_SKIP=true: Skips Prometheus Operator installation during test setup. + // - CERT_MANAGER_INSTALL_SKIP=true: Skips CertManager installation during test setup. + // These variables are useful if Prometheus or CertManager is already installed, avoiding + // re-installation and conflicts. + skipPrometheusInstall = os.Getenv("PROMETHEUS_INSTALL_SKIP") == "true" + skipCertManagerInstall = os.Getenv("CERT_MANAGER_INSTALL_SKIP") == "true" + // isPrometheusOperatorAlreadyInstalled will be set true when prometheus CRDs be found on the cluster + isPrometheusOperatorAlreadyInstalled = false + // isCertManagerAlreadyInstalled will be set true when CertManager CRDs be found on the cluster + isCertManagerAlreadyInstalled = false + + // projectImage is the name of the image which will be build and loaded + // with the code source changes to be tested. + projectImage = "example.com/tensor-fusion-operator:v0.0.1" +) + +// TestE2E runs the end-to-end (e2e) test suite for the project. These tests execute in an isolated, +// temporary environment to validate project changes with the the purposed to be used in CI jobs. +// The default setup requires Kind, builds/loads the Manager Docker image locally, and installs +// CertManager and Prometheus. 
+func TestE2E(t *testing.T) { + RegisterFailHandler(Fail) + _, _ = fmt.Fprintf(GinkgoWriter, "Starting tensor-fusion-operator integration test suite\n") + RunSpecs(t, "e2e suite") +} + +var _ = BeforeSuite(func() { + By("Ensure that Prometheus is enabled") + _ = utils.UncommentCode("config/default/kustomization.yaml", "#- ../prometheus", "#") + + By("generating files") + cmd := exec.Command("make", "generate") + _, err := utils.Run(cmd) + ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to run make generate") + + By("generating manifests") + cmd = exec.Command("make", "manifests") + _, err = utils.Run(cmd) + ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to run make manifests") + + By("building the manager(Operator) image") + cmd = exec.Command("make", "docker-build", fmt.Sprintf("IMG=%s", projectImage)) + _, err = utils.Run(cmd) + ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to build the manager(Operator) image") + + // TODO(user): If you want to change the e2e test vendor from Kind, ensure the image is + // built and available before running the tests. Also, remove the following block. + By("loading the manager(Operator) image on Kind") + err = utils.LoadImageToKindClusterWithName(projectImage) + ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to load the manager(Operator) image into Kind") + + // The tests-e2e are intended to run on a temporary cluster that is created and destroyed for testing. + // To prevent errors when tests run in environments with Prometheus or CertManager already installed, + // we check for their presence before execution. 
+ // Setup Prometheus and CertManager before the suite if not skipped and if not already installed + if !skipPrometheusInstall { + By("checking if prometheus is installed already") + isPrometheusOperatorAlreadyInstalled = utils.IsPrometheusCRDsInstalled() + if !isPrometheusOperatorAlreadyInstalled { + _, _ = fmt.Fprintf(GinkgoWriter, "Installing Prometheus Operator...\n") + Expect(utils.InstallPrometheusOperator()).To(Succeed(), "Failed to install Prometheus Operator") + } else { + _, _ = fmt.Fprintf(GinkgoWriter, "WARNING: Prometheus Operator is already installed. Skipping installation...\n") + } + } + if !skipCertManagerInstall { + By("checking if cert manager is installed already") + isCertManagerAlreadyInstalled = utils.IsCertManagerCRDsInstalled() + if !isCertManagerAlreadyInstalled { + _, _ = fmt.Fprintf(GinkgoWriter, "Installing CertManager...\n") + Expect(utils.InstallCertManager()).To(Succeed(), "Failed to install CertManager") + } else { + _, _ = fmt.Fprintf(GinkgoWriter, "WARNING: CertManager is already installed. Skipping installation...\n") + } + } +}) + +var _ = AfterSuite(func() { + // Teardown Prometheus and CertManager after the suite if not skipped and if they were not already installed + if !skipPrometheusInstall && !isPrometheusOperatorAlreadyInstalled { + _, _ = fmt.Fprintf(GinkgoWriter, "Uninstalling Prometheus Operator...\n") + utils.UninstallPrometheusOperator() + } + if !skipCertManagerInstall && !isCertManagerAlreadyInstalled { + _, _ = fmt.Fprintf(GinkgoWriter, "Uninstalling CertManager...\n") + utils.UninstallCertManager() + } +}) diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go new file mode 100644 index 0000000..a218480 --- /dev/null +++ b/test/e2e/e2e_test.go @@ -0,0 +1,307 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "encoding/json" + "fmt" + "os" + "os/exec" + "path/filepath" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/NexusGPU/tensor-fusion-operator/test/utils" +) + +// namespace where the project is deployed in +const namespace = "tensor-fusion-operator-system" + +// serviceAccountName created for the project +const serviceAccountName = "tensor-fusion-operator-controller-manager" + +// metricsServiceName is the name of the metrics service of the project +const metricsServiceName = "tensor-fusion-operator-controller-manager-metrics-service" + +// metricsRoleBindingName is the name of the RBAC that will be created to allow get the metrics data +const metricsRoleBindingName = "tensor-fusion-operator-metrics-binding" + +var _ = Describe("Manager", Ordered, func() { + var controllerPodName string + + // Before running the tests, set up the environment by creating the namespace, + // installing CRDs, and deploying the controller. 
+ BeforeAll(func() { + By("creating manager namespace") + cmd := exec.Command("kubectl", "create", "ns", namespace) + _, err := utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to create namespace") + + By("installing CRDs") + cmd = exec.Command("make", "install") + _, err = utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to install CRDs") + + By("deploying the controller-manager") + cmd = exec.Command("make", "deploy", fmt.Sprintf("IMG=%s", projectImage)) + _, err = utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to deploy the controller-manager") + }) + + // After all tests have been executed, clean up by undeploying the controller, uninstalling CRDs, + // and deleting the namespace. + AfterAll(func() { + By("cleaning up the curl pod for metrics") + cmd := exec.Command("kubectl", "delete", "pod", "curl-metrics", "-n", namespace) + _, _ = utils.Run(cmd) + + By("undeploying the controller-manager") + cmd = exec.Command("make", "undeploy") + _, _ = utils.Run(cmd) + + By("uninstalling CRDs") + cmd = exec.Command("make", "uninstall") + _, _ = utils.Run(cmd) + + By("removing manager namespace") + cmd = exec.Command("kubectl", "delete", "ns", namespace) + _, _ = utils.Run(cmd) + }) + + // After each test, check for failures and collect logs, events, + // and pod descriptions for debugging. 
+ AfterEach(func() { + specReport := CurrentSpecReport() + if specReport.Failed() { + By("Fetching controller manager pod logs") + cmd := exec.Command("kubectl", "logs", controllerPodName, "-n", namespace) + controllerLogs, err := utils.Run(cmd) + if err == nil { + _, _ = fmt.Fprintf(GinkgoWriter, fmt.Sprintf("Controller logs:\n %s", controllerLogs)) + } else { + _, _ = fmt.Fprintf(GinkgoWriter, fmt.Sprintf("Failed to get Controller logs: %s", err)) + } + + By("Fetching Kubernetes events") + cmd = exec.Command("kubectl", "get", "events", "-n", namespace, "--sort-by=.lastTimestamp") + eventsOutput, err := utils.Run(cmd) + if err == nil { + _, _ = fmt.Fprintf(GinkgoWriter, fmt.Sprintf("Kubernetes events:\n%s", eventsOutput)) + } else { + _, _ = fmt.Fprintf(GinkgoWriter, fmt.Sprintf("Failed to get Kubernetes events: %s", err)) + } + + By("Fetching curl-metrics logs") + cmd = exec.Command("kubectl", "logs", "curl-metrics", "-n", namespace) + metricsOutput, err := utils.Run(cmd) + if err == nil { + _, _ = fmt.Fprintf(GinkgoWriter, fmt.Sprintf("Metrics logs:\n %s", metricsOutput)) + } else { + _, _ = fmt.Fprintf(GinkgoWriter, fmt.Sprintf("Failed to get curl-metrics logs: %s", err)) + } + + By("Fetching controller manager pod description") + cmd = exec.Command("kubectl", "describe", "pod", controllerPodName, "-n", namespace) + podDescription, err := utils.Run(cmd) + if err == nil { + fmt.Println("Pod description:\n", podDescription) + } else { + fmt.Println("Failed to describe controller pod") + } + } + }) + + SetDefaultEventuallyTimeout(2 * time.Minute) + SetDefaultEventuallyPollingInterval(time.Second) + + Context("Manager", func() { + It("should run successfully", func() { + By("validating that the controller-manager pod is running as expected") + verifyControllerUp := func(g Gomega) { + // Get the name of the controller-manager pod + cmd := exec.Command("kubectl", "get", + "pods", "-l", "control-plane=controller-manager", + "-o", "go-template={{ range .items }}"+ + 
"{{ if not .metadata.deletionTimestamp }}"+ + "{{ .metadata.name }}"+ + "{{ \"\\n\" }}{{ end }}{{ end }}", + "-n", namespace, + ) + + podOutput, err := utils.Run(cmd) + g.Expect(err).NotTo(HaveOccurred(), "Failed to retrieve controller-manager pod information") + podNames := utils.GetNonEmptyLines(podOutput) + g.Expect(podNames).To(HaveLen(1), "expected 1 controller pod running") + controllerPodName = podNames[0] + g.Expect(controllerPodName).To(ContainSubstring("controller-manager")) + + // Validate the pod's status + cmd = exec.Command("kubectl", "get", + "pods", controllerPodName, "-o", "jsonpath={.status.phase}", + "-n", namespace, + ) + output, err := utils.Run(cmd) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(output).To(Equal("Running"), "Incorrect controller-manager pod status") + } + Eventually(verifyControllerUp).Should(Succeed()) + }) + + It("should ensure the metrics endpoint is serving metrics", func() { + By("creating a ClusterRoleBinding for the service account to allow access to metrics") + cmd := exec.Command("kubectl", "create", "clusterrolebinding", metricsRoleBindingName, + "--clusterrole=tensor-fusion-operator-metrics-reader", + fmt.Sprintf("--serviceaccount=%s:%s", namespace, serviceAccountName), + ) + _, err := utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to create ClusterRoleBinding") + + By("validating that the metrics service is available") + cmd = exec.Command("kubectl", "get", "service", metricsServiceName, "-n", namespace) + _, err = utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Metrics service should exist") + + By("validating that the ServiceMonitor for Prometheus is applied in the namespace") + cmd = exec.Command("kubectl", "get", "ServiceMonitor", "-n", namespace) + _, err = utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "ServiceMonitor should exist") + + By("getting the service account token") + token, err := serviceAccountToken() + Expect(err).NotTo(HaveOccurred()) + Expect(token).NotTo(BeEmpty()) + + 
By("waiting for the metrics endpoint to be ready") + verifyMetricsEndpointReady := func(g Gomega) { + cmd := exec.Command("kubectl", "get", "endpoints", metricsServiceName, "-n", namespace) + output, err := utils.Run(cmd) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(output).To(ContainSubstring("8443"), "Metrics endpoint is not ready") + } + Eventually(verifyMetricsEndpointReady).Should(Succeed()) + + By("verifying that the controller manager is serving the metrics server") + verifyMetricsServerStarted := func(g Gomega) { + cmd := exec.Command("kubectl", "logs", controllerPodName, "-n", namespace) + output, err := utils.Run(cmd) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(output).To(ContainSubstring("controller-runtime.metrics\tServing metrics server"), + "Metrics server not yet started") + } + Eventually(verifyMetricsServerStarted).Should(Succeed()) + + By("creating the curl-metrics pod to access the metrics endpoint") + cmd = exec.Command("kubectl", "run", "curl-metrics", "--restart=Never", + "--namespace", namespace, + "--image=curlimages/curl:7.78.0", + "--", "/bin/sh", "-c", fmt.Sprintf( + "curl -v -k -H 'Authorization: Bearer %s' https://%s.%s.svc.cluster.local:8443/metrics", + token, metricsServiceName, namespace)) + _, err = utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to create curl-metrics pod") + + By("waiting for the curl-metrics pod to complete.") + verifyCurlUp := func(g Gomega) { + cmd := exec.Command("kubectl", "get", "pods", "curl-metrics", + "-o", "jsonpath={.status.phase}", + "-n", namespace) + output, err := utils.Run(cmd) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(output).To(Equal("Succeeded"), "curl pod in wrong status") + } + Eventually(verifyCurlUp, 5*time.Minute).Should(Succeed()) + + By("getting the metrics by checking curl-metrics logs") + metricsOutput := getMetricsOutput() + Expect(metricsOutput).To(ContainSubstring( + "controller_runtime_reconcile_total", + )) + }) + + // 
+kubebuilder:scaffold:e2e-webhooks-checks + + // TODO: Customize the e2e test suite with scenarios specific to your project. + // Consider applying sample/CR(s) and check their status and/or verifying + // the reconciliation by using the metrics, i.e.: + // metricsOutput := getMetricsOutput() + // Expect(metricsOutput).To(ContainSubstring( + // fmt.Sprintf(`controller_runtime_reconcile_total{controller="%s",result="success"} 1`, + // strings.ToLower(), + // )) + }) +}) + +// serviceAccountToken returns a token for the specified service account in the given namespace. +// It uses the Kubernetes TokenRequest API to generate a token by directly sending a request +// and parsing the resulting token from the API response. +func serviceAccountToken() (string, error) { + const tokenRequestRawString = `{ + "apiVersion": "authentication.k8s.io/v1", + "kind": "TokenRequest" + }` + + // Temporary file to store the token request + secretName := fmt.Sprintf("%s-token-request", serviceAccountName) + tokenRequestFile := filepath.Join("/tmp", secretName) + err := os.WriteFile(tokenRequestFile, []byte(tokenRequestRawString), os.FileMode(0o644)) + if err != nil { + return "", err + } + + var out string + verifyTokenCreation := func(g Gomega) { + // Execute kubectl command to create the token + cmd := exec.Command("kubectl", "create", "--raw", fmt.Sprintf( + "/api/v1/namespaces/%s/serviceaccounts/%s/token", + namespace, + serviceAccountName, + ), "-f", tokenRequestFile) + + output, err := cmd.CombinedOutput() + g.Expect(err).NotTo(HaveOccurred()) + + // Parse the JSON output to extract the token + var token tokenRequest + err = json.Unmarshal([]byte(output), &token) + g.Expect(err).NotTo(HaveOccurred()) + + out = token.Status.Token + } + Eventually(verifyTokenCreation).Should(Succeed()) + + return out, err +} + +// getMetricsOutput retrieves and returns the logs from the curl pod used to access the metrics endpoint. 
+func getMetricsOutput() string { + By("getting the curl-metrics logs") + cmd := exec.Command("kubectl", "logs", "curl-metrics", "-n", namespace) + metricsOutput, err := utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to retrieve logs from curl pod") + Expect(metricsOutput).To(ContainSubstring("< HTTP/1.1 200 OK")) + return metricsOutput +} + +// tokenRequest is a simplified representation of the Kubernetes TokenRequest API response, +// containing only the token field that we need to extract. +type tokenRequest struct { + Status struct { + Token string `json:"token"` + } `json:"status"` +} diff --git a/test/utils/utils.go b/test/utils/utils.go new file mode 100644 index 0000000..c3d51ce --- /dev/null +++ b/test/utils/utils.go @@ -0,0 +1,251 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package utils + +import ( + "bufio" + "bytes" + "fmt" + "os" + "os/exec" + "strings" + + . 
"github.com/onsi/ginkgo/v2" //nolint:golint,revive +) + +const ( + prometheusOperatorVersion = "v0.77.1" + prometheusOperatorURL = "https://github.com/prometheus-operator/prometheus-operator/" + + "releases/download/%s/bundle.yaml" + + certmanagerVersion = "v1.16.0" + certmanagerURLTmpl = "https://github.com/jetstack/cert-manager/releases/download/%s/cert-manager.yaml" +) + +func warnError(err error) { + _, _ = fmt.Fprintf(GinkgoWriter, "warning: %v\n", err) +} + +// Run executes the provided command within this context +func Run(cmd *exec.Cmd) (string, error) { + dir, _ := GetProjectDir() + cmd.Dir = dir + + if err := os.Chdir(cmd.Dir); err != nil { + _, _ = fmt.Fprintf(GinkgoWriter, "chdir dir: %s\n", err) + } + + cmd.Env = append(os.Environ(), "GO111MODULE=on") + command := strings.Join(cmd.Args, " ") + _, _ = fmt.Fprintf(GinkgoWriter, "running: %s\n", command) + output, err := cmd.CombinedOutput() + if err != nil { + return string(output), fmt.Errorf("%s failed with error: (%v) %s", command, err, string(output)) + } + + return string(output), nil +} + +// InstallPrometheusOperator installs the prometheus Operator to be used to export the enabled metrics. +func InstallPrometheusOperator() error { + url := fmt.Sprintf(prometheusOperatorURL, prometheusOperatorVersion) + cmd := exec.Command("kubectl", "create", "-f", url) + _, err := Run(cmd) + return err +} + +// UninstallPrometheusOperator uninstalls the prometheus +func UninstallPrometheusOperator() { + url := fmt.Sprintf(prometheusOperatorURL, prometheusOperatorVersion) + cmd := exec.Command("kubectl", "delete", "-f", url) + if _, err := Run(cmd); err != nil { + warnError(err) + } +} + +// IsPrometheusCRDsInstalled checks if any Prometheus CRDs are installed +// by verifying the existence of key CRDs related to Prometheus. 
+func IsPrometheusCRDsInstalled() bool { + // List of common Prometheus CRDs + prometheusCRDs := []string{ + "prometheuses.monitoring.coreos.com", + "prometheusrules.monitoring.coreos.com", + "prometheusagents.monitoring.coreos.com", + } + + cmd := exec.Command("kubectl", "get", "crds", "-o", "custom-columns=NAME:.metadata.name") + output, err := Run(cmd) + if err != nil { + return false + } + crdList := GetNonEmptyLines(string(output)) + for _, crd := range prometheusCRDs { + for _, line := range crdList { + if strings.Contains(line, crd) { + return true + } + } + } + + return false +} + +// UninstallCertManager uninstalls the cert manager +func UninstallCertManager() { + url := fmt.Sprintf(certmanagerURLTmpl, certmanagerVersion) + cmd := exec.Command("kubectl", "delete", "-f", url) + if _, err := Run(cmd); err != nil { + warnError(err) + } +} + +// InstallCertManager installs the cert manager bundle. +func InstallCertManager() error { + url := fmt.Sprintf(certmanagerURLTmpl, certmanagerVersion) + cmd := exec.Command("kubectl", "apply", "-f", url) + if _, err := Run(cmd); err != nil { + return err + } + // Wait for cert-manager-webhook to be ready, which can take time if cert-manager + // was re-installed after uninstalling on a cluster. + cmd = exec.Command("kubectl", "wait", "deployment.apps/cert-manager-webhook", + "--for", "condition=Available", + "--namespace", "cert-manager", + "--timeout", "5m", + ) + + _, err := Run(cmd) + return err +} + +// IsCertManagerCRDsInstalled checks if any Cert Manager CRDs are installed +// by verifying the existence of key CRDs related to Cert Manager. 
+func IsCertManagerCRDsInstalled() bool { + // List of common Cert Manager CRDs + certManagerCRDs := []string{ + "certificates.cert-manager.io", + "issuers.cert-manager.io", + "clusterissuers.cert-manager.io", + "certificaterequests.cert-manager.io", + "orders.acme.cert-manager.io", + "challenges.acme.cert-manager.io", + } + + // Execute the kubectl command to get all CRDs + cmd := exec.Command("kubectl", "get", "crds") + output, err := Run(cmd) + if err != nil { + return false + } + + // Check if any of the Cert Manager CRDs are present + crdList := GetNonEmptyLines(string(output)) + for _, crd := range certManagerCRDs { + for _, line := range crdList { + if strings.Contains(line, crd) { + return true + } + } + } + + return false +} + +// LoadImageToKindClusterWithName loads a local docker image to the kind cluster +func LoadImageToKindClusterWithName(name string) error { + cluster := "kind" + if v, ok := os.LookupEnv("KIND_CLUSTER"); ok { + cluster = v + } + kindOptions := []string{"load", "docker-image", name, "--name", cluster} + cmd := exec.Command("kind", kindOptions...) + _, err := Run(cmd) + return err +} + +// GetNonEmptyLines converts given command output string into individual objects +// according to line breakers, and ignores the empty elements in it. +func GetNonEmptyLines(output string) []string { + var res []string + elements := strings.Split(output, "\n") + for _, element := range elements { + if element != "" { + res = append(res, element) + } + } + + return res +} + +// GetProjectDir will return the directory where the project is +func GetProjectDir() (string, error) { + wd, err := os.Getwd() + if err != nil { + return wd, err + } + wd = strings.Replace(wd, "/test/e2e", "", -1) + return wd, nil +} + +// UncommentCode searches for target in the file and remove the comment prefix +// of the target content. The target content may span multiple lines. 
+func UncommentCode(filename, target, prefix string) error { + // false positive + // nolint:gosec + content, err := os.ReadFile(filename) + if err != nil { + return err + } + strContent := string(content) + + idx := strings.Index(strContent, target) + if idx < 0 { + return fmt.Errorf("unable to find the code %s to be uncomment", target) + } + + out := new(bytes.Buffer) + _, err = out.Write(content[:idx]) + if err != nil { + return err + } + + scanner := bufio.NewScanner(bytes.NewBufferString(target)) + if !scanner.Scan() { + return nil + } + for { + _, err := out.WriteString(strings.TrimPrefix(scanner.Text(), prefix)) + if err != nil { + return err + } + // Avoid writing a newline in case the previous line was the last in target. + if !scanner.Scan() { + break + } + if _, err := out.WriteString("\n"); err != nil { + return err + } + } + + _, err = out.Write(content[idx+len(target):]) + if err != nil { + return err + } + // false positive + // nolint:gosec + return os.WriteFile(filename, out.Bytes(), 0644) +} From 3e184e1cefe6c7c056866f42d9e0b4cae25a445f Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Tue, 3 Dec 2024 07:16:14 +0000 Subject: [PATCH 02/22] feat: initialize definition of GPUNode and TensorFusionConnection --- PROJECT | 19 ++ api/v1/gpunode_types.go | 51 ++++ api/v1/groupversion_info.go | 36 +++ api/v1/tensorfusionconnection_types.go | 77 ++++++ api/v1/zz_generated.deepcopy.go | 224 ++++++++++++++++++ cmd/main.go | 18 ++ ...r-fusion.ai.tensor-fusion.ai_gpunodes.yaml | 86 +++++++ ...sor-fusion.ai_tensorfusionconnections.yaml | 107 +++++++++ config/crd/kustomization.yaml | 21 ++ config/crd/kustomizeconfig.yaml | 19 ++ config/default/kustomization.yaml | 2 +- config/rbac/gpunode_editor_role.yaml | 27 +++ config/rbac/gpunode_viewer_role.yaml | 23 ++ config/rbac/kustomization.yaml | 9 + config/rbac/role.yaml | 36 ++- .../tensorfusionconnection_editor_role.yaml | 27 +++ .../tensorfusionconnection_viewer_role.yaml | 23 ++ 
config/samples/kustomization.yaml | 5 + .../samples/tensor-fusion.ai_v1_gpunode.yaml | 9 + ...r-fusion.ai_v1_tensorfusionconnection.yaml | 9 + internal/controller/gpunode_controller.go | 63 +++++ .../controller/gpunode_controller_test.go | 84 +++++++ internal/controller/suite_test.go | 96 ++++++++ .../tensorfusionconnection_controller.go | 63 +++++ .../tensorfusionconnection_controller_test.go | 84 +++++++ 25 files changed, 1211 insertions(+), 7 deletions(-) create mode 100644 api/v1/gpunode_types.go create mode 100644 api/v1/groupversion_info.go create mode 100644 api/v1/tensorfusionconnection_types.go create mode 100644 api/v1/zz_generated.deepcopy.go create mode 100644 config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml create mode 100644 config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml create mode 100644 config/crd/kustomization.yaml create mode 100644 config/crd/kustomizeconfig.yaml create mode 100644 config/rbac/gpunode_editor_role.yaml create mode 100644 config/rbac/gpunode_viewer_role.yaml create mode 100644 config/rbac/tensorfusionconnection_editor_role.yaml create mode 100644 config/rbac/tensorfusionconnection_viewer_role.yaml create mode 100644 config/samples/kustomization.yaml create mode 100644 config/samples/tensor-fusion.ai_v1_gpunode.yaml create mode 100644 config/samples/tensor-fusion.ai_v1_tensorfusionconnection.yaml create mode 100644 internal/controller/gpunode_controller.go create mode 100644 internal/controller/gpunode_controller_test.go create mode 100644 internal/controller/suite_test.go create mode 100644 internal/controller/tensorfusionconnection_controller.go create mode 100644 internal/controller/tensorfusionconnection_controller_test.go diff --git a/PROJECT b/PROJECT index cb60042..80dae4a 100644 --- a/PROJECT +++ b/PROJECT @@ -7,4 +7,23 @@ layout: - go.kubebuilder.io/v4 projectName: tensor-fusion-operator repo: github.com/NexusGPU/tensor-fusion-operator +resources: +- api: + crdVersion: v1 + 
namespaced: true + controller: true + domain: tensor-fusion.ai + group: tensor-fusion.ai + kind: TensorFusionConnection + path: github.com/NexusGPU/tensor-fusion-operator/api/v1 + version: v1 +- api: + crdVersion: v1 + namespaced: true + controller: true + domain: tensor-fusion.ai + group: tensor-fusion.ai + kind: GPUNode + path: github.com/NexusGPU/tensor-fusion-operator/api/v1 + version: v1 version: "3" diff --git a/api/v1/gpunode_types.go b/api/v1/gpunode_types.go new file mode 100644 index 0000000..45524cd --- /dev/null +++ b/api/v1/gpunode_types.go @@ -0,0 +1,51 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// GPUNodeStatus defines the observed state of GPUNode. +type GPUNodeStatus struct { + Capacity Resource `json:"capacity"` + Used Resource `json:"used"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status + +// GPUNode is the Schema for the gpunodes API. +type GPUNode struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Status GPUNodeStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// GPUNodeList contains a list of GPUNode. 
+type GPUNodeList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []GPUNode `json:"items"` +} + +func init() { + SchemeBuilder.Register(&GPUNode{}, &GPUNodeList{}) +} diff --git a/api/v1/groupversion_info.go b/api/v1/groupversion_info.go new file mode 100644 index 0000000..9172ec6 --- /dev/null +++ b/api/v1/groupversion_info.go @@ -0,0 +1,36 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package v1 contains API Schema definitions for the tensor-fusion.ai v1 API group. +// +kubebuilder:object:generate=true +// +groupName=tensor-fusion.ai.tensor-fusion.ai +package v1 + +import ( + "k8s.io/apimachinery/pkg/runtime/schema" + "sigs.k8s.io/controller-runtime/pkg/scheme" +) + +var ( + // GroupVersion is group version used to register these objects. + GroupVersion = schema.GroupVersion{Group: "tensor-fusion.ai.tensor-fusion.ai", Version: "v1"} + + // SchemeBuilder is used to add go types to the GroupVersionKind scheme. + SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} + + // AddToScheme adds the types in this group-version to the given scheme. + AddToScheme = SchemeBuilder.AddToScheme +) diff --git a/api/v1/tensorfusionconnection_types.go b/api/v1/tensorfusionconnection_types.go new file mode 100644 index 0000000..955f227 --- /dev/null +++ b/api/v1/tensorfusionconnection_types.go @@ -0,0 +1,77 @@ +/* +Copyright 2024. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1 + +import ( + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +type Resource struct { + Tflops resource.Quantity `json:"tflops"` + Vram resource.Quantity `json:"vram"` +} + +type Resources struct { + Request Resource `json:"request"` + Limit Resource `json:"limit"` +} + +// TensorFusionConnectionSpec defines the desired state of TensorFusionConnection. +type TensorFusionConnectionSpec struct { + Resources Resources `json:"resources"` +} + +type TensorFusionConnectionPhase string + +// These are the valid phases of a GpuConnection. +const ( + TensorFusionConnectionPending TensorFusionConnectionPhase = "Pending" + TensorFusionConnectionRunning TensorFusionConnectionPhase = "Running" +) + +// TensorFusionConnectionStatus defines the observed state of TensorFusionConnection. +type TensorFusionConnectionStatus struct { + Phase TensorFusionConnectionPhase `json:"phase"` + ConnectionURL string `json:"connectionURL"` + QosClass string `json:"qosClass"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status + +// TensorFusionConnection is the Schema for the tensorfusionconnections API. 
+type TensorFusionConnection struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec TensorFusionConnectionSpec `json:"spec,omitempty"` + Status TensorFusionConnectionStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// TensorFusionConnectionList contains a list of TensorFusionConnection. +type TensorFusionConnectionList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []TensorFusionConnection `json:"items"` +} + +func init() { + SchemeBuilder.Register(&TensorFusionConnection{}, &TensorFusionConnectionList{}) +} diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go new file mode 100644 index 0000000..841b343 --- /dev/null +++ b/api/v1/zz_generated.deepcopy.go @@ -0,0 +1,224 @@ +//go:build !ignore_autogenerated + +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by controller-gen. DO NOT EDIT. + +package v1 + +import ( + runtime "k8s.io/apimachinery/pkg/runtime" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUNode) DeepCopyInto(out *GPUNode) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNode. 
+func (in *GPUNode) DeepCopy() *GPUNode { + if in == nil { + return nil + } + out := new(GPUNode) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *GPUNode) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUNodeList) DeepCopyInto(out *GPUNodeList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]GPUNode, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeList. +func (in *GPUNodeList) DeepCopy() *GPUNodeList { + if in == nil { + return nil + } + out := new(GPUNodeList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *GPUNodeList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUNodeStatus) DeepCopyInto(out *GPUNodeStatus) { + *out = *in + in.Capacity.DeepCopyInto(&out.Capacity) + in.Used.DeepCopyInto(&out.Used) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeStatus. +func (in *GPUNodeStatus) DeepCopy() *GPUNodeStatus { + if in == nil { + return nil + } + out := new(GPUNodeStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *Resource) DeepCopyInto(out *Resource) { + *out = *in + out.Tflops = in.Tflops.DeepCopy() + out.Vram = in.Vram.DeepCopy() +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Resource. +func (in *Resource) DeepCopy() *Resource { + if in == nil { + return nil + } + out := new(Resource) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Resources) DeepCopyInto(out *Resources) { + *out = *in + in.Request.DeepCopyInto(&out.Request) + in.Limit.DeepCopyInto(&out.Limit) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Resources. +func (in *Resources) DeepCopy() *Resources { + if in == nil { + return nil + } + out := new(Resources) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TensorFusionConnection) DeepCopyInto(out *TensorFusionConnection) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + out.Status = in.Status +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorFusionConnection. +func (in *TensorFusionConnection) DeepCopy() *TensorFusionConnection { + if in == nil { + return nil + } + out := new(TensorFusionConnection) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *TensorFusionConnection) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *TensorFusionConnectionList) DeepCopyInto(out *TensorFusionConnectionList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]TensorFusionConnection, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorFusionConnectionList. +func (in *TensorFusionConnectionList) DeepCopy() *TensorFusionConnectionList { + if in == nil { + return nil + } + out := new(TensorFusionConnectionList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *TensorFusionConnectionList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TensorFusionConnectionSpec) DeepCopyInto(out *TensorFusionConnectionSpec) { + *out = *in + in.Resources.DeepCopyInto(&out.Resources) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorFusionConnectionSpec. +func (in *TensorFusionConnectionSpec) DeepCopy() *TensorFusionConnectionSpec { + if in == nil { + return nil + } + out := new(TensorFusionConnectionSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TensorFusionConnectionStatus) DeepCopyInto(out *TensorFusionConnectionStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorFusionConnectionStatus. 
+func (in *TensorFusionConnectionStatus) DeepCopy() *TensorFusionConnectionStatus { + if in == nil { + return nil + } + out := new(TensorFusionConnectionStatus) + in.DeepCopyInto(out) + return out +} diff --git a/cmd/main.go b/cmd/main.go index 8992f96..4d3856e 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -34,6 +34,9 @@ import ( "sigs.k8s.io/controller-runtime/pkg/metrics/filters" metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/controller-runtime/pkg/webhook" + + tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" + "github.com/NexusGPU/tensor-fusion-operator/internal/controller" // +kubebuilder:scaffold:imports ) @@ -45,6 +48,7 @@ var ( func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(tensorfusionaiv1.AddToScheme(scheme)) // +kubebuilder:scaffold:scheme } @@ -138,6 +142,20 @@ func main() { os.Exit(1) } + if err = (&controller.TensorFusionConnectionReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "TensorFusionConnection") + os.Exit(1) + } + if err = (&controller.GPUNodeReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "GPUNode") + os.Exit(1) + } // +kubebuilder:scaffold:builder if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { diff --git a/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml b/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml new file mode 100644 index 0000000..b3a3b46 --- /dev/null +++ b/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml @@ -0,0 +1,86 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.4 + name: gpunodes.tensor-fusion.ai.tensor-fusion.ai +spec: + 
group: tensor-fusion.ai.tensor-fusion.ai + names: + kind: GPUNode + listKind: GPUNodeList + plural: gpunodes + singular: gpunode + scope: Namespaced + versions: + - name: v1 + schema: + openAPIV3Schema: + description: GPUNode is the Schema for the gpunodes API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + status: + description: GPUNodeStatus defines the observed state of GPUNode. 
+ properties: + capacity: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + used: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + required: + - capacity + - used + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml b/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml new file mode 100644 index 0000000..135776a --- /dev/null +++ b/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml @@ -0,0 +1,107 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.4 + name: tensorfusionconnections.tensor-fusion.ai.tensor-fusion.ai +spec: + group: tensor-fusion.ai.tensor-fusion.ai + names: + kind: TensorFusionConnection + listKind: TensorFusionConnectionList + plural: tensorfusionconnections + singular: tensorfusionconnection + scope: Namespaced + versions: + - name: v1 + schema: + openAPIV3Schema: + description: TensorFusionConnection is 
the Schema for the tensorfusionconnections + API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: TensorFusionConnectionSpec defines the desired state of TensorFusionConnection. + properties: + resources: + properties: + limit: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + request: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + required: + - limit + - request + type: object + required: + - resources + type: 
object + status: + description: TensorFusionConnectionStatus defines the observed state of + TensorFusionConnection. + properties: + connectionURL: + type: string + phase: + type: string + qosClass: + type: string + required: + - connectionURL + - phase + - qosClass + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml new file mode 100644 index 0000000..127a6ba --- /dev/null +++ b/config/crd/kustomization.yaml @@ -0,0 +1,21 @@ +# This kustomization.yaml is not intended to be run by itself, +# since it depends on service name and namespace that are out of this kustomize package. +# It should be run by config/default +resources: +- bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml +- bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml +# +kubebuilder:scaffold:crdkustomizeresource + +patches: +# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix. +# patches here are for enabling the conversion webhook for each CRD +# +kubebuilder:scaffold:crdkustomizewebhookpatch + +# [CERTMANAGER] To enable cert-manager, uncomment all the sections with [CERTMANAGER] prefix. +# patches here are for enabling the CA injection for each CRD +# +kubebuilder:scaffold:crdkustomizecainjectionpatch + +# [WEBHOOK] To enable webhook, uncomment the following section +# the following config is for teaching kustomize how to do kustomization for CRDs. 
+#configurations: +#- kustomizeconfig.yaml diff --git a/config/crd/kustomizeconfig.yaml b/config/crd/kustomizeconfig.yaml new file mode 100644 index 0000000..ec5c150 --- /dev/null +++ b/config/crd/kustomizeconfig.yaml @@ -0,0 +1,19 @@ +# This file is for teaching kustomize how to substitute name and namespace reference in CRD +nameReference: +- kind: Service + version: v1 + fieldSpecs: + - kind: CustomResourceDefinition + version: v1 + group: apiextensions.k8s.io + path: spec/conversion/webhook/clientConfig/service/name + +namespace: +- kind: CustomResourceDefinition + version: v1 + group: apiextensions.k8s.io + path: spec/conversion/webhook/clientConfig/service/namespace + create: false + +varReference: +- path: metadata/annotations diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml index c27f571..4f303ca 100644 --- a/config/default/kustomization.yaml +++ b/config/default/kustomization.yaml @@ -15,7 +15,7 @@ namePrefix: tensor-fusion-operator- # someName: someValue resources: -#- ../crd +- ../crd - ../rbac - ../manager # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in diff --git a/config/rbac/gpunode_editor_role.yaml b/config/rbac/gpunode_editor_role.yaml new file mode 100644 index 0000000..11c1526 --- /dev/null +++ b/config/rbac/gpunode_editor_role.yaml @@ -0,0 +1,27 @@ +# permissions for end users to edit gpunodes. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize + name: gpunode-editor-role +rules: +- apiGroups: + - tensor-fusion.ai.tensor-fusion.ai + resources: + - gpunodes + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - tensor-fusion.ai.tensor-fusion.ai + resources: + - gpunodes/status + verbs: + - get diff --git a/config/rbac/gpunode_viewer_role.yaml b/config/rbac/gpunode_viewer_role.yaml new file mode 100644 index 0000000..a4808a0 --- /dev/null +++ b/config/rbac/gpunode_viewer_role.yaml @@ -0,0 +1,23 @@ +# permissions for end users to view gpunodes. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize + name: gpunode-viewer-role +rules: +- apiGroups: + - tensor-fusion.ai.tensor-fusion.ai + resources: + - gpunodes + verbs: + - get + - list + - watch +- apiGroups: + - tensor-fusion.ai.tensor-fusion.ai + resources: + - gpunodes/status + verbs: + - get diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml index 5619aa0..0bb7cfe 100644 --- a/config/rbac/kustomization.yaml +++ b/config/rbac/kustomization.yaml @@ -18,3 +18,12 @@ resources: - metrics_auth_role.yaml - metrics_auth_role_binding.yaml - metrics_reader_role.yaml +# For each CRD, "Editor" and "Viewer" roles are scaffolded by +# default, aiding admins in cluster management. Those roles are +# not used by the Project itself. You can comment the following lines +# if you do not want those helpers be installed with your Project. 
+- gpunode_editor_role.yaml +- gpunode_viewer_role.yaml +- tensorfusionconnection_editor_role.yaml +- tensorfusionconnection_viewer_role.yaml + diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 7454ff6..b5d3369 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -1,11 +1,35 @@ +--- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - labels: - app.kubernetes.io/name: tensor-fusion-operator - app.kubernetes.io/managed-by: kustomize name: manager-role rules: -- apiGroups: [""] - resources: ["pods"] - verbs: ["get", "list", "watch"] +- apiGroups: + - tensor-fusion.ai.tensor-fusion.ai + resources: + - gpunodes + - tensorfusionconnections + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - tensor-fusion.ai.tensor-fusion.ai + resources: + - gpunodes/finalizers + - tensorfusionconnections/finalizers + verbs: + - update +- apiGroups: + - tensor-fusion.ai.tensor-fusion.ai + resources: + - gpunodes/status + - tensorfusionconnections/status + verbs: + - get + - patch + - update diff --git a/config/rbac/tensorfusionconnection_editor_role.yaml b/config/rbac/tensorfusionconnection_editor_role.yaml new file mode 100644 index 0000000..d7627ed --- /dev/null +++ b/config/rbac/tensorfusionconnection_editor_role.yaml @@ -0,0 +1,27 @@ +# permissions for end users to edit tensorfusionconnections. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize + name: tensorfusionconnection-editor-role +rules: +- apiGroups: + - tensor-fusion.ai.tensor-fusion.ai + resources: + - tensorfusionconnections + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - tensor-fusion.ai.tensor-fusion.ai + resources: + - tensorfusionconnections/status + verbs: + - get diff --git a/config/rbac/tensorfusionconnection_viewer_role.yaml b/config/rbac/tensorfusionconnection_viewer_role.yaml new file mode 100644 index 0000000..498b61e --- /dev/null +++ b/config/rbac/tensorfusionconnection_viewer_role.yaml @@ -0,0 +1,23 @@ +# permissions for end users to view tensorfusionconnections. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize + name: tensorfusionconnection-viewer-role +rules: +- apiGroups: + - tensor-fusion.ai.tensor-fusion.ai + resources: + - tensorfusionconnections + verbs: + - get + - list + - watch +- apiGroups: + - tensor-fusion.ai.tensor-fusion.ai + resources: + - tensorfusionconnections/status + verbs: + - get diff --git a/config/samples/kustomization.yaml b/config/samples/kustomization.yaml new file mode 100644 index 0000000..022e343 --- /dev/null +++ b/config/samples/kustomization.yaml @@ -0,0 +1,5 @@ +## Append samples of your project ## +resources: +- tensor-fusion.ai_v1_tensorfusionconnection.yaml +- tensor-fusion.ai_v1_gpunode.yaml +# +kubebuilder:scaffold:manifestskustomizesamples diff --git a/config/samples/tensor-fusion.ai_v1_gpunode.yaml b/config/samples/tensor-fusion.ai_v1_gpunode.yaml new file mode 100644 index 0000000..0957bdb --- /dev/null +++ b/config/samples/tensor-fusion.ai_v1_gpunode.yaml @@ -0,0 +1,9 @@ +apiVersion: tensor-fusion.ai.tensor-fusion.ai/v1 +kind: GPUNode +metadata: + 
labels: + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize + name: gpunode-sample +spec: + # TODO(user): Add fields here diff --git a/config/samples/tensor-fusion.ai_v1_tensorfusionconnection.yaml b/config/samples/tensor-fusion.ai_v1_tensorfusionconnection.yaml new file mode 100644 index 0000000..91c2a95 --- /dev/null +++ b/config/samples/tensor-fusion.ai_v1_tensorfusionconnection.yaml @@ -0,0 +1,9 @@ +apiVersion: tensor-fusion.ai.tensor-fusion.ai/v1 +kind: TensorFusionConnection +metadata: + labels: + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize + name: tensorfusionconnection-sample +spec: + # TODO(user): Add fields here diff --git a/internal/controller/gpunode_controller.go b/internal/controller/gpunode_controller.go new file mode 100644 index 0000000..70fbd9b --- /dev/null +++ b/internal/controller/gpunode_controller.go @@ -0,0 +1,63 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package controller + +import ( + "context" + + "k8s.io/apimachinery/pkg/runtime" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + + tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" +) + +// GPUNodeReconciler reconciles a GPUNode object +type GPUNodeReconciler struct { + client.Client + Scheme *runtime.Scheme +} + +// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=gpunodes,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=gpunodes/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=gpunodes/finalizers,verbs=update + +// Reconcile is part of the main kubernetes reconciliation loop which aims to +// move the current state of the cluster closer to the desired state. +// TODO(user): Modify the Reconcile function to compare the state specified by +// the GPUNode object against the actual cluster state, and then +// perform operations to make the cluster state reflect the state specified by +// the user. +// +// For more details, check Reconcile and its Result here: +// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/reconcile +func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + _ = log.FromContext(ctx) + + // TODO(user): your logic here + + return ctrl.Result{}, nil +} + +// SetupWithManager sets up the controller with the Manager. +func (r *GPUNodeReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&tensorfusionaiv1.GPUNode{}). + Named("gpunode"). 
+ Complete(r) +} diff --git a/internal/controller/gpunode_controller_test.go b/internal/controller/gpunode_controller_test.go new file mode 100644 index 0000000..8cf0c89 --- /dev/null +++ b/internal/controller/gpunode_controller_test.go @@ -0,0 +1,84 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" +) + +var _ = Describe("GPUNode Controller", func() { + Context("When reconciling a resource", func() { + const resourceName = "test-resource" + + ctx := context.Background() + + typeNamespacedName := types.NamespacedName{ + Name: resourceName, + Namespace: "default", // TODO(user):Modify as needed + } + gpunode := &tensorfusionaiv1.GPUNode{} + + BeforeEach(func() { + By("creating the custom resource for the Kind GPUNode") + err := k8sClient.Get(ctx, typeNamespacedName, gpunode) + if err != nil && errors.IsNotFound(err) { + resource := &tensorfusionaiv1.GPUNode{ + ObjectMeta: metav1.ObjectMeta{ + Name: resourceName, + Namespace: "default", + }, + // TODO(user): Specify other spec details if needed. 
+ } + Expect(k8sClient.Create(ctx, resource)).To(Succeed()) + } + }) + + AfterEach(func() { + // TODO(user): Cleanup logic after each test, like removing the resource instance. + resource := &tensorfusionaiv1.GPUNode{} + err := k8sClient.Get(ctx, typeNamespacedName, resource) + Expect(err).NotTo(HaveOccurred()) + + By("Cleanup the specific resource instance GPUNode") + Expect(k8sClient.Delete(ctx, resource)).To(Succeed()) + }) + It("should successfully reconcile the resource", func() { + By("Reconciling the created resource") + controllerReconciler := &GPUNodeReconciler{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + } + + _, err := controllerReconciler.Reconcile(ctx, reconcile.Request{ + NamespacedName: typeNamespacedName, + }) + Expect(err).NotTo(HaveOccurred()) + // TODO(user): Add more specific assertions depending on your controller's reconciliation logic. + // Example: If you expect a certain status condition after reconciliation, verify it here. + }) + }) +}) diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go new file mode 100644 index 0000000..33f944a --- /dev/null +++ b/internal/controller/suite_test.go @@ -0,0 +1,96 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + "fmt" + "path/filepath" + "runtime" + "testing" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/envtest" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + + tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" + // +kubebuilder:scaffold:imports +) + +// These tests use Ginkgo (BDD-style Go testing framework). Refer to +// http://onsi.github.io/ginkgo/ to learn more about Ginkgo. + +var cfg *rest.Config +var k8sClient client.Client +var testEnv *envtest.Environment +var ctx context.Context +var cancel context.CancelFunc + +func TestControllers(t *testing.T) { + RegisterFailHandler(Fail) + + RunSpecs(t, "Controller Suite") +} + +var _ = BeforeSuite(func() { + logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) + + ctx, cancel = context.WithCancel(context.TODO()) + + By("bootstrapping test environment") + testEnv = &envtest.Environment{ + CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, + ErrorIfCRDPathMissing: true, + + // The BinaryAssetsDirectory is only required if you want to run the tests directly + // without call the makefile target test. If not informed it will look for the + // default path defined in controller-runtime which is /usr/local/kubebuilder/. + // Note that you must have the required binaries setup under the bin directory to perform + // the tests directly. When we run make test it will be setup and used automatically. + BinaryAssetsDirectory: filepath.Join("..", "..", "bin", "k8s", + fmt.Sprintf("1.31.0-%s-%s", runtime.GOOS, runtime.GOARCH)), + } + + var err error + // cfg is defined in this file globally. 
+ cfg, err = testEnv.Start() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg).NotTo(BeNil()) + + err = tensorfusionaiv1.AddToScheme(scheme.Scheme) + Expect(err).NotTo(HaveOccurred()) + + // +kubebuilder:scaffold:scheme + + k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) + Expect(err).NotTo(HaveOccurred()) + Expect(k8sClient).NotTo(BeNil()) + +}) + +var _ = AfterSuite(func() { + By("tearing down the test environment") + cancel() + err := testEnv.Stop() + Expect(err).NotTo(HaveOccurred()) +}) diff --git a/internal/controller/tensorfusionconnection_controller.go b/internal/controller/tensorfusionconnection_controller.go new file mode 100644 index 0000000..84e6f50 --- /dev/null +++ b/internal/controller/tensorfusionconnection_controller.go @@ -0,0 +1,63 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package controller + +import ( + "context" + + "k8s.io/apimachinery/pkg/runtime" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + + tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" +) + +// TensorFusionConnectionReconciler reconciles a TensorFusionConnection object +type TensorFusionConnectionReconciler struct { + client.Client + Scheme *runtime.Scheme +} + +// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=tensorfusionconnections,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=tensorfusionconnections/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=tensorfusionconnections/finalizers,verbs=update + +// Reconcile is part of the main kubernetes reconciliation loop which aims to +// move the current state of the cluster closer to the desired state. +// TODO(user): Modify the Reconcile function to compare the state specified by +// the TensorFusionConnection object against the actual cluster state, and then +// perform operations to make the cluster state reflect the state specified by +// the user. +// +// For more details, check Reconcile and its Result here: +// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/reconcile +func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + _ = log.FromContext(ctx) + + // TODO(user): your logic here + + return ctrl.Result{}, nil +} + +// SetupWithManager sets up the controller with the Manager. +func (r *TensorFusionConnectionReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&tensorfusionaiv1.TensorFusionConnection{}). + Named("tensorfusionconnection"). 
+ Complete(r) +} diff --git a/internal/controller/tensorfusionconnection_controller_test.go b/internal/controller/tensorfusionconnection_controller_test.go new file mode 100644 index 0000000..6ad2872 --- /dev/null +++ b/internal/controller/tensorfusionconnection_controller_test.go @@ -0,0 +1,84 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" +) + +var _ = Describe("TensorFusionConnection Controller", func() { + Context("When reconciling a resource", func() { + const resourceName = "test-resource" + + ctx := context.Background() + + typeNamespacedName := types.NamespacedName{ + Name: resourceName, + Namespace: "default", // TODO(user):Modify as needed + } + tensorfusionconnection := &tensorfusionaiv1.TensorFusionConnection{} + + BeforeEach(func() { + By("creating the custom resource for the Kind TensorFusionConnection") + err := k8sClient.Get(ctx, typeNamespacedName, tensorfusionconnection) + if err != nil && errors.IsNotFound(err) { + resource := &tensorfusionaiv1.TensorFusionConnection{ + ObjectMeta: metav1.ObjectMeta{ + Name: resourceName, + Namespace: "default", + }, + // TODO(user): Specify other spec details 
if needed. + } + Expect(k8sClient.Create(ctx, resource)).To(Succeed()) + } + }) + + AfterEach(func() { + // TODO(user): Cleanup logic after each test, like removing the resource instance. + resource := &tensorfusionaiv1.TensorFusionConnection{} + err := k8sClient.Get(ctx, typeNamespacedName, resource) + Expect(err).NotTo(HaveOccurred()) + + By("Cleanup the specific resource instance TensorFusionConnection") + Expect(k8sClient.Delete(ctx, resource)).To(Succeed()) + }) + It("should successfully reconcile the resource", func() { + By("Reconciling the created resource") + controllerReconciler := &TensorFusionConnectionReconciler{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + } + + _, err := controllerReconciler.Reconcile(ctx, reconcile.Request{ + NamespacedName: typeNamespacedName, + }) + Expect(err).NotTo(HaveOccurred()) + // TODO(user): Add more specific assertions depending on your controller's reconciliation logic. + // Example: If you expect a certain status condition after reconciliation, verify it here. + }) + }) +}) From 08e8e570f79f6c8c00c5ff7dcab26d87f06b0207 Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Tue, 3 Dec 2024 07:34:51 +0000 Subject: [PATCH 03/22] feat(scheduler): implement a simple first-fit GPU node scheduler --- api/v1/gpunode_types.go | 4 +- api/v1/zz_generated.deepcopy.go | 2 +- internal/scheduler/naive.go | 58 +++++++++++++ internal/scheduler/naive_test.go | 144 +++++++++++++++++++++++++++++++ internal/scheduler/scheduler.go | 20 +++++ 5 files changed, 225 insertions(+), 3 deletions(-) create mode 100644 internal/scheduler/naive.go create mode 100644 internal/scheduler/naive_test.go create mode 100644 internal/scheduler/scheduler.go diff --git a/api/v1/gpunode_types.go b/api/v1/gpunode_types.go index 45524cd..24fe843 100644 --- a/api/v1/gpunode_types.go +++ b/api/v1/gpunode_types.go @@ -22,8 +22,8 @@ import ( // GPUNodeStatus defines the observed state of GPUNode. 
type GPUNodeStatus struct { - Capacity Resource `json:"capacity"` - Used Resource `json:"used"` + Capacity Resource `json:"capacity"` + Available Resource `json:"available"` } // +kubebuilder:object:root=true diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 841b343..28bd614 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -86,7 +86,7 @@ func (in *GPUNodeList) DeepCopyObject() runtime.Object { func (in *GPUNodeStatus) DeepCopyInto(out *GPUNodeStatus) { *out = *in in.Capacity.DeepCopyInto(&out.Capacity) - in.Used.DeepCopyInto(&out.Used) + in.Available.DeepCopyInto(&out.Available) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeStatus. diff --git a/internal/scheduler/naive.go b/internal/scheduler/naive.go new file mode 100644 index 0000000..24f95ff --- /dev/null +++ b/internal/scheduler/naive.go @@ -0,0 +1,58 @@ +package scheduler + +import ( + "fmt" + "sync" + + v1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" +) + +// NaiveScheduler implements a simple scheduling strategy +type NaiveScheduler struct { + sync.RWMutex + nodes map[string]*v1.GPUNode +} + +// NewNaiveScheduler creates a new NaiveScheduler +func NewNaiveScheduler() *NaiveScheduler { + return &NaiveScheduler{ + nodes: make(map[string]*v1.GPUNode), + } +} + +// Schedule implements Scheduler interface +func (s *NaiveScheduler) Schedule(request v1.Resource) (*v1.GPUNode, error) { + s.RLock() + defer s.RUnlock() + + // Simple strategy: return the first node that has enough resources + for _, node := range s.nodes { + if node.Status.Available.Tflops.Cmp(request.Tflops) >= 0 && + node.Status.Available.Vram.Cmp(request.Vram) >= 0 { + return node, nil + } + } + + return nil, fmt.Errorf("no suitable node found for request: %v", request) +} + +// OnAdd implements Scheduler interface +func (s *NaiveScheduler) OnAdd(node *v1.GPUNode) { + s.Lock() + defer s.Unlock() + s.nodes[node.Name] = 
node +} + +// OnUpdate implements Scheduler interface +func (s *NaiveScheduler) OnUpdate(oldNode, newNode *v1.GPUNode) { + s.Lock() + defer s.Unlock() + s.nodes[newNode.Name] = newNode +} + +// OnDelete implements Scheduler interface +func (s *NaiveScheduler) OnDelete(node *v1.GPUNode) { + s.Lock() + defer s.Unlock() + delete(s.nodes, node.Name) +} diff --git a/internal/scheduler/naive_test.go b/internal/scheduler/naive_test.go new file mode 100644 index 0000000..eac7740 --- /dev/null +++ b/internal/scheduler/naive_test.go @@ -0,0 +1,144 @@ +package scheduler + +import ( + "testing" + + v1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func createGPUNode(name string, tflops, vram string) *v1.GPUNode { + return &v1.GPUNode{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + Status: v1.GPUNodeStatus{ + Available: v1.Resource{ + Tflops: resource.MustParse(tflops), + Vram: resource.MustParse(vram), + }, + }, + } +} + +func createRequest(tflops, vram string) v1.Resource { + return v1.Resource{ + Tflops: resource.MustParse(tflops), + Vram: resource.MustParse(vram), + } +} + +func TestNaiveScheduler_Schedule(t *testing.T) { + tests := []struct { + name string + nodes []*v1.GPUNode + request v1.Resource + wantNode string + wantError bool + }{ + { + name: "simple match", + nodes: []*v1.GPUNode{ + createGPUNode("node1", "100", "16Gi"), + }, + request: createRequest("50", "8Gi"), + wantNode: "node1", + wantError: false, + }, + { + name: "no nodes", + nodes: []*v1.GPUNode{}, + request: createRequest("50", "8Gi"), + wantNode: "", + wantError: true, + }, + { + name: "insufficient resources", + nodes: []*v1.GPUNode{ + createGPUNode("node1", "40", "16Gi"), + }, + request: createRequest("50", "8Gi"), + wantNode: "", + wantError: true, + }, + { + name: "multiple nodes, first fit", + nodes: []*v1.GPUNode{ + createGPUNode("node1", "40", "16Gi"), + createGPUNode("node2", "100", 
"32Gi"), + }, + request: createRequest("50", "8Gi"), + wantNode: "node2", + wantError: false, + }, + { + name: "exact match", + nodes: []*v1.GPUNode{ + createGPUNode("node1", "50", "8Gi"), + }, + request: createRequest("50", "8Gi"), + wantNode: "node1", + wantError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + s := NewNaiveScheduler() + + // Add nodes + for _, node := range tt.nodes { + s.OnAdd(node) + } + + // Try to schedule + got, err := s.Schedule(tt.request) + + // Check error + if (err != nil) != tt.wantError { + t.Errorf("Schedule() error = %v, wantError %v", err, tt.wantError) + return + } + + // Check result + if !tt.wantError { + if got == nil { + t.Error("Schedule() returned nil node when error not expected") + return + } + if got.Name != tt.wantNode { + t.Errorf("Schedule() got node = %v, want %v", got.Name, tt.wantNode) + } + } + }) + } +} + +func TestNaiveScheduler_NodeOperations(t *testing.T) { + s := NewNaiveScheduler() + node1 := createGPUNode("node1", "100", "16Gi") + request := createRequest("50", "8Gi") + + // Test OnAdd + s.OnAdd(node1) + got, err := s.Schedule(request) + if err != nil || got.Name != "node1" { + t.Errorf("After OnAdd: Schedule() got = %v, want node1", got) + } + + // Test OnUpdate + node1Updated := createGPUNode("node1", "40", "16Gi") + s.OnUpdate(node1, node1Updated) + got, err = s.Schedule(request) + if err == nil { + t.Error("After OnUpdate: Schedule() should fail with insufficient resources") + } + + // Test OnDelete + s.OnDelete(node1Updated) + got, err = s.Schedule(request) + if err == nil { + t.Error("After OnDelete: Schedule() should fail with no nodes") + } +} diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go new file mode 100644 index 0000000..6f9d9b5 --- /dev/null +++ b/internal/scheduler/scheduler.go @@ -0,0 +1,20 @@ +package scheduler + +import ( + v1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" +) + +// Scheduler is the interface that 
wraps the scheduling methods +type Scheduler interface { + // Schedule takes a Resource Request and returns the pointer of the GPU node + // that can accommodate the request. If no suitable node is found, it returns + // an nil pointer and an error. + Schedule(request v1.Resource) (*v1.GPUNode, error) + + // OnAdd is called when a new node is added + OnAdd(node *v1.GPUNode) + // OnUpdate is called when a node is modified + OnUpdate(oldNode, newNode *v1.GPUNode) + // OnDelete is called when a node is deleted + OnDelete(node *v1.GPUNode) +} From f60c82840e01f9d852e24a32810badb20a434de2 Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Tue, 3 Dec 2024 09:55:30 +0000 Subject: [PATCH 04/22] feat: implement gpunode and tensorfusionconnection controller --- api/v1/gpunode_types.go | 1 + api/v1/zz_generated.deepcopy.go | 5 + cmd/main.go | 4 +- ...r-fusion.ai.tensor-fusion.ai_gpunodes.yaml | 11 ++- internal/controller/gpunode_controller.go | 52 +++++++---- .../tensorfusionconnection_controller.go | 92 ++++++++++++++++--- internal/scheduler/naive.go | 24 ++--- internal/scheduler/naive_test.go | 84 ++++++++++------- internal/scheduler/scheduler.go | 10 +- internal/worker/worker.go | 9 ++ 10 files changed, 213 insertions(+), 79 deletions(-) create mode 100644 internal/worker/worker.go diff --git a/api/v1/gpunode_types.go b/api/v1/gpunode_types.go index 24fe843..6a752ba 100644 --- a/api/v1/gpunode_types.go +++ b/api/v1/gpunode_types.go @@ -24,6 +24,7 @@ import ( type GPUNodeStatus struct { Capacity Resource `json:"capacity"` Available Resource `json:"available"` + Devices []string `json:"devices"` } // +kubebuilder:object:root=true diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 28bd614..d899ad7 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -87,6 +87,11 @@ func (in *GPUNodeStatus) DeepCopyInto(out *GPUNodeStatus) { *out = *in in.Capacity.DeepCopyInto(&out.Capacity) 
in.Available.DeepCopyInto(&out.Available) + if in.Devices != nil { + in, out := &in.Devices, &out.Devices + *out = make([]string, len(*in)) + copy(*out, *in) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeStatus. diff --git a/cmd/main.go b/cmd/main.go index 4d3856e..ac14a23 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -17,6 +17,7 @@ limitations under the License. package main import ( + "context" "crypto/tls" "flag" "os" @@ -142,6 +143,7 @@ func main() { os.Exit(1) } + ctx := context.Background() if err = (&controller.TensorFusionConnectionReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), @@ -152,7 +154,7 @@ func main() { if err = (&controller.GPUNodeReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), - }).SetupWithManager(mgr); err != nil { + }).SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "GPUNode") os.Exit(1) } diff --git a/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml b/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml index b3a3b46..4829b1f 100644 --- a/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml +++ b/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml @@ -39,7 +39,7 @@ spec: status: description: GPUNodeStatus defines the observed state of GPUNode. 
properties: - capacity: + available: properties: tflops: anyOf: @@ -57,7 +57,7 @@ spec: - tflops - vram type: object - used: + capacity: properties: tflops: anyOf: @@ -75,9 +75,14 @@ spec: - tflops - vram type: object + devices: + items: + type: string + type: array required: + - available - capacity - - used + - devices type: object type: object served: true diff --git a/internal/controller/gpunode_controller.go b/internal/controller/gpunode_controller.go index 70fbd9b..265f840 100644 --- a/internal/controller/gpunode_controller.go +++ b/internal/controller/gpunode_controller.go @@ -22,15 +22,18 @@ import ( "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/predicate" - tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" + tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" + scheduler "github.com/NexusGPU/tensor-fusion-operator/internal/scheduler" ) // GPUNodeReconciler reconciles a GPUNode object type GPUNodeReconciler struct { client.Client - Scheme *runtime.Scheme + Scheme *runtime.Scheme + Scheduler scheduler.Scheduler } // +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=gpunodes,verbs=get;list;watch;create;update;patch;delete @@ -39,25 +42,42 @@ type GPUNodeReconciler struct { // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. -// TODO(user): Modify the Reconcile function to compare the state specified by -// the GPUNode object against the actual cluster state, and then -// perform operations to make the cluster state reflect the state specified by -// the user. 
-//
-// For more details, check Reconcile and its Result here:
-// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/reconcile
 func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
-	_ = log.FromContext(ctx)
-
-	// TODO(user): your logic here
-
+	// TODO: Calculate tflops and update capacity here
 	return ctrl.Result{}, nil
 }
 
 // SetupWithManager sets up the controller with the Manager.
-func (r *GPUNodeReconciler) SetupWithManager(mgr ctrl.Manager) error {
+func (r *GPUNodeReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
+	// Seed the scheduler with existing GPUNodes through the uncached API reader: the manager cache cannot serve reads before mgr.Start.
+	existingNodes := &tfv1.GPUNodeList{}
+	if err := mgr.GetAPIReader().List(ctx, existingNodes); err != nil {
+		return err
+	}
+
+	// Add all existing nodes to scheduler
+	for i := range existingNodes.Items {
+		r.Scheduler.OnAdd(&existingNodes.Items[i])
+	}
+
 	return ctrl.NewControllerManagedBy(mgr).
-		For(&tensorfusionaiv1.GPUNode{}).
+		For(&tfv1.GPUNode{}).
 		Named("gpunode").
+		WithEventFilter(
+			predicate.Funcs{
+				CreateFunc: func(e event.CreateEvent) bool {
+					r.Scheduler.OnAdd(e.Object.(*tfv1.GPUNode))
+					return true
+				},
+				UpdateFunc: func(e event.UpdateEvent) bool {
+					r.Scheduler.OnUpdate(e.ObjectOld.(*tfv1.GPUNode), e.ObjectNew.(*tfv1.GPUNode))
+					return true
+				},
+				DeleteFunc: func(e event.DeleteEvent) bool {
+					r.Scheduler.OnDelete(e.Object.(*tfv1.GPUNode))
+					return true
+				},
+			},
+		).
Complete(r) } diff --git a/internal/controller/tensorfusionconnection_controller.go b/internal/controller/tensorfusionconnection_controller.go index 84e6f50..b9223f4 100644 --- a/internal/controller/tensorfusionconnection_controller.go +++ b/internal/controller/tensorfusionconnection_controller.go @@ -19,18 +19,23 @@ package controller import ( "context" + "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/util/retry" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" - tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" + tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" + scheduler "github.com/NexusGPU/tensor-fusion-operator/internal/scheduler" + "github.com/NexusGPU/tensor-fusion-operator/internal/worker" ) // TensorFusionConnectionReconciler reconciles a TensorFusionConnection object type TensorFusionConnectionReconciler struct { client.Client - Scheme *runtime.Scheme + Scheme *runtime.Scheme + Scheduler scheduler.Scheduler } // +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=tensorfusionconnections,verbs=get;list;watch;create;update;patch;delete @@ -39,25 +44,88 @@ type TensorFusionConnectionReconciler struct { // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. -// TODO(user): Modify the Reconcile function to compare the state specified by -// the TensorFusionConnection object against the actual cluster state, and then -// perform operations to make the cluster state reflect the state specified by -// the user. 
-//
-// For more details, check Reconcile and its Result here:
-// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/reconcile
 func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
-	_ = log.FromContext(ctx)
+	log := log.FromContext(ctx)
 
-	// TODO(user): your logic here
+	// Get the TensorFusionConnection object
+	connection := &tfv1.TensorFusionConnection{}
+	if err := r.Get(ctx, req.NamespacedName, connection); err != nil {
+		if errors.IsNotFound(err) {
+			// Object not found, could have been deleted after reconcile request, return without error
+			return ctrl.Result{}, nil
+		}
+		log.Error(err, "Failed to get TensorFusionConnection")
+		return ctrl.Result{}, err
+	}
+
+	var node *tfv1.GPUNode
+	// If status is not set or pending, try to get an available node from the scheduler
+	if connection.Status.Phase == "" || connection.Status.Phase == tfv1.TensorFusionConnectionPending {
+		var err error // declared separately so "=" below fills the outer node; ":=" would shadow it and pass nil to MustUpdateStatus
+		node, err = r.Scheduler.Schedule(connection.Spec.Resources.Request)
+		if err != nil {
+			log.Error(err, "Failed to schedule connection")
+			connection.Status.Phase = tfv1.TensorFusionConnectionPending
+		} else if node != nil {
+			connection.Status.Phase = tfv1.TensorFusionConnectionRunning
+			connection.Status.ConnectionURL = worker.GenerateConnectionURL(node, connection)
+		} else {
+			connection.Status.Phase = tfv1.TensorFusionConnectionPending
+		}
+	}
+
+	if err := r.MustUpdateStatus(ctx, connection, node); err != nil {
+		return ctrl.Result{}, err
+	}
 
 	return ctrl.Result{}, nil
 }
 
+func (r *TensorFusionConnectionReconciler) MustUpdateStatus(ctx context.Context, connection *tfv1.TensorFusionConnection, gpuNode *tfv1.GPUNode) error {
+	return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
+		// Get the latest version of the connection
+		latestConnection := &tfv1.TensorFusionConnection{}
+		if err := r.Get(ctx, client.ObjectKey{
+			Name:      connection.Name,
+			Namespace: connection.Namespace,
+		},
latestConnection); err != nil { + return err + } + + // Update the status fields we care about + latestConnection.Status.Phase = connection.Status.Phase + latestConnection.Status.ConnectionURL = connection.Status.ConnectionURL + + // Update the connection status + if err := r.Status().Update(ctx, latestConnection); err != nil { + return err + } + + if gpuNode != nil { + // Get the latest version of the node + latestNode := &tfv1.GPUNode{} + + if err := r.Get(ctx, client.ObjectKey{ + Name: gpuNode.Name, + Namespace: gpuNode.Namespace, + }, latestNode); err != nil { + return err + } + + // Update the status fields we care about + latestNode.Status.Available = gpuNode.Status.Available + if err := r.Status().Update(ctx, latestNode); err != nil { + return err + } + } + return nil + }) +} + // SetupWithManager sets up the controller with the Manager. func (r *TensorFusionConnectionReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). - For(&tensorfusionaiv1.TensorFusionConnection{}). + For(&tfv1.TensorFusionConnection{}). Named("tensorfusionconnection"). 
Complete(r) } diff --git a/internal/scheduler/naive.go b/internal/scheduler/naive.go index 24f95ff..8d9a4d3 100644 --- a/internal/scheduler/naive.go +++ b/internal/scheduler/naive.go @@ -4,54 +4,56 @@ import ( "fmt" "sync" - v1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" + tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" ) // NaiveScheduler implements a simple scheduling strategy type NaiveScheduler struct { - sync.RWMutex - nodes map[string]*v1.GPUNode + sync.Mutex + nodes map[string]*tfv1.GPUNode } // NewNaiveScheduler creates a new NaiveScheduler func NewNaiveScheduler() *NaiveScheduler { return &NaiveScheduler{ - nodes: make(map[string]*v1.GPUNode), + nodes: make(map[string]*tfv1.GPUNode), } } // Schedule implements Scheduler interface -func (s *NaiveScheduler) Schedule(request v1.Resource) (*v1.GPUNode, error) { - s.RLock() - defer s.RUnlock() +func (s *NaiveScheduler) Schedule(request tfv1.Resource) (*tfv1.GPUNode, error) { + s.Lock() + defer s.Unlock() // Simple strategy: return the first node that has enough resources for _, node := range s.nodes { if node.Status.Available.Tflops.Cmp(request.Tflops) >= 0 && node.Status.Available.Vram.Cmp(request.Vram) >= 0 { + // Update the node's available resources + node.Status.Available.Tflops.Sub(request.Tflops) + node.Status.Available.Vram.Sub(request.Vram) return node, nil } } - return nil, fmt.Errorf("no suitable node found for request: %v", request) } // OnAdd implements Scheduler interface -func (s *NaiveScheduler) OnAdd(node *v1.GPUNode) { +func (s *NaiveScheduler) OnAdd(node *tfv1.GPUNode) { s.Lock() defer s.Unlock() s.nodes[node.Name] = node } // OnUpdate implements Scheduler interface -func (s *NaiveScheduler) OnUpdate(oldNode, newNode *v1.GPUNode) { +func (s *NaiveScheduler) OnUpdate(oldNode, newNode *tfv1.GPUNode) { s.Lock() defer s.Unlock() s.nodes[newNode.Name] = newNode } // OnDelete implements Scheduler interface -func (s *NaiveScheduler) OnDelete(node *v1.GPUNode) { +func (s 
*NaiveScheduler) OnDelete(node *tfv1.GPUNode) { s.Lock() defer s.Unlock() delete(s.nodes, node.Name) diff --git a/internal/scheduler/naive_test.go b/internal/scheduler/naive_test.go index eac7740..be1affd 100644 --- a/internal/scheduler/naive_test.go +++ b/internal/scheduler/naive_test.go @@ -3,18 +3,18 @@ package scheduler import ( "testing" - v1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" + tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -func createGPUNode(name string, tflops, vram string) *v1.GPUNode { - return &v1.GPUNode{ +func createGPUNode(name string, tflops, vram string) *tfv1.GPUNode { + return &tfv1.GPUNode{ ObjectMeta: metav1.ObjectMeta{ Name: name, }, - Status: v1.GPUNodeStatus{ - Available: v1.Resource{ + Status: tfv1.GPUNodeStatus{ + Available: tfv1.Resource{ Tflops: resource.MustParse(tflops), Vram: resource.MustParse(vram), }, @@ -22,8 +22,8 @@ func createGPUNode(name string, tflops, vram string) *v1.GPUNode { } } -func createRequest(tflops, vram string) v1.Resource { - return v1.Resource{ +func createRequest(tflops, vram string) tfv1.Resource { + return tfv1.Resource{ Tflops: resource.MustParse(tflops), Vram: resource.MustParse(vram), } @@ -31,31 +31,35 @@ func createRequest(tflops, vram string) v1.Resource { func TestNaiveScheduler_Schedule(t *testing.T) { tests := []struct { - name string - nodes []*v1.GPUNode - request v1.Resource - wantNode string - wantError bool + name string + nodes []*tfv1.GPUNode + request tfv1.Resource + wantNode string + wantError bool + wantRemainingTflops string + wantRemainingVram string }{ { name: "simple match", - nodes: []*v1.GPUNode{ + nodes: []*tfv1.GPUNode{ createGPUNode("node1", "100", "16Gi"), }, - request: createRequest("50", "8Gi"), - wantNode: "node1", - wantError: false, + request: createRequest("50", "8Gi"), + wantNode: "node1", + wantError: false, + wantRemainingTflops: "50", + wantRemainingVram: 
"8Gi", }, { - name: "no nodes", - nodes: []*v1.GPUNode{}, + name: "no nodes", + nodes: []*tfv1.GPUNode{}, request: createRequest("50", "8Gi"), wantNode: "", wantError: true, }, { name: "insufficient resources", - nodes: []*v1.GPUNode{ + nodes: []*tfv1.GPUNode{ createGPUNode("node1", "40", "16Gi"), }, request: createRequest("50", "8Gi"), @@ -64,29 +68,33 @@ func TestNaiveScheduler_Schedule(t *testing.T) { }, { name: "multiple nodes, first fit", - nodes: []*v1.GPUNode{ + nodes: []*tfv1.GPUNode{ createGPUNode("node1", "40", "16Gi"), createGPUNode("node2", "100", "32Gi"), }, - request: createRequest("50", "8Gi"), - wantNode: "node2", - wantError: false, + request: createRequest("50", "8Gi"), + wantNode: "node2", + wantError: false, + wantRemainingTflops: "50", + wantRemainingVram: "24Gi", }, { name: "exact match", - nodes: []*v1.GPUNode{ + nodes: []*tfv1.GPUNode{ createGPUNode("node1", "50", "8Gi"), }, - request: createRequest("50", "8Gi"), - wantNode: "node1", - wantError: false, + request: createRequest("50", "8Gi"), + wantNode: "node1", + wantError: false, + wantRemainingTflops: "0", + wantRemainingVram: "0", }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { s := NewNaiveScheduler() - + // Add nodes for _, node := range tt.nodes { s.OnAdd(node) @@ -94,7 +102,7 @@ func TestNaiveScheduler_Schedule(t *testing.T) { // Try to schedule got, err := s.Schedule(tt.request) - + // Check error if (err != nil) != tt.wantError { t.Errorf("Schedule() error = %v, wantError %v", err, tt.wantError) @@ -110,6 +118,20 @@ func TestNaiveScheduler_Schedule(t *testing.T) { if got.Name != tt.wantNode { t.Errorf("Schedule() got node = %v, want %v", got.Name, tt.wantNode) } + + // Check remaining resources + if tt.wantRemainingTflops != "" { + wantTflops := resource.MustParse(tt.wantRemainingTflops) + if got.Status.Available.Tflops.Cmp(wantTflops) != 0 { + t.Errorf("Remaining Tflops = %v, want %v", got.Status.Available.Tflops.String(), tt.wantRemainingTflops) + } + } + if 
tt.wantRemainingVram != "" { + wantVram := resource.MustParse(tt.wantRemainingVram) + if got.Status.Available.Vram.Cmp(wantVram) != 0 { + t.Errorf("Remaining Vram = %v, want %v", got.Status.Available.Vram.String(), tt.wantRemainingVram) + } + } } }) } @@ -130,14 +152,14 @@ func TestNaiveScheduler_NodeOperations(t *testing.T) { // Test OnUpdate node1Updated := createGPUNode("node1", "40", "16Gi") s.OnUpdate(node1, node1Updated) - got, err = s.Schedule(request) + _, err = s.Schedule(request) if err == nil { t.Error("After OnUpdate: Schedule() should fail with insufficient resources") } // Test OnDelete s.OnDelete(node1Updated) - got, err = s.Schedule(request) + _, err = s.Schedule(request) if err == nil { t.Error("After OnDelete: Schedule() should fail with no nodes") } diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go index 6f9d9b5..47ac776 100644 --- a/internal/scheduler/scheduler.go +++ b/internal/scheduler/scheduler.go @@ -1,7 +1,7 @@ package scheduler import ( - v1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" + tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" ) // Scheduler is the interface that wraps the scheduling methods @@ -9,12 +9,12 @@ type Scheduler interface { // Schedule takes a Resource Request and returns the pointer of the GPU node // that can accommodate the request. If no suitable node is found, it returns // an nil pointer and an error. 
- Schedule(request v1.Resource) (*v1.GPUNode, error) + Schedule(request tfv1.Resource) (*tfv1.GPUNode, error) // OnAdd is called when a new node is added - OnAdd(node *v1.GPUNode) + OnAdd(node *tfv1.GPUNode) // OnUpdate is called when a node is modified - OnUpdate(oldNode, newNode *v1.GPUNode) + OnUpdate(oldNode, newNode *tfv1.GPUNode) // OnDelete is called when a node is deleted - OnDelete(node *v1.GPUNode) + OnDelete(node *tfv1.GPUNode) } diff --git a/internal/worker/worker.go b/internal/worker/worker.go new file mode 100644 index 0000000..74b93d3 --- /dev/null +++ b/internal/worker/worker.go @@ -0,0 +1,9 @@ +package worker + +import ( + tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" +) + +func GenerateConnectionURL(_node *tfv1.GPUNode, _connection *tfv1.TensorFusionConnection) string { + return "TODO://" +} From 58e3406bd597c9bb7c34e2834a8c442069eb9a9f Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Tue, 3 Dec 2024 10:57:30 +0000 Subject: [PATCH 05/22] feat: add HTTP server with connection router --- cmd/main.go | 26 ++++- go.mod | 20 ++++ go.sum | 52 ++++++++++ internal/server/router/connection.go | 145 +++++++++++++++++++++++++++ internal/server/server.go | 19 ++++ 5 files changed, 261 insertions(+), 1 deletion(-) create mode 100644 internal/server/router/connection.go create mode 100644 internal/server/server.go diff --git a/cmd/main.go b/cmd/main.go index ac14a23..d2d6e85 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -30,6 +30,7 @@ import ( utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/metrics/filters" @@ -38,6 +39,8 @@ import ( tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" "github.com/NexusGPU/tensor-fusion-operator/internal/controller" + 
"github.com/NexusGPU/tensor-fusion-operator/internal/server" + "github.com/NexusGPU/tensor-fusion-operator/internal/server/router" // +kubebuilder:scaffold:imports ) @@ -119,7 +122,8 @@ func main() { // this setup is not recommended for production. } - mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ + kc := ctrl.GetConfigOrDie() + mgr, err := ctrl.NewManager(kc, ctrl.Options{ Scheme: scheme, Metrics: metricsServerOptions, WebhookServer: webhookServer, @@ -169,6 +173,26 @@ func main() { os.Exit(1) } + // Initialize and start the HTTP server + client, err := client.NewWithWatch(kc, client.Options{Scheme: scheme}) + if err != nil { + setupLog.Error(err, "failed to create client with watch") + os.Exit(1) + } + connectionRouter, err := router.NewConnectionRouter(ctx, client) + if err != nil { + setupLog.Error(err, "failed to create connection router") + os.Exit(1) + } + httpServer := server.NewHTTPServer(connectionRouter) + go func() { + err := httpServer.Run() + if err != nil { + setupLog.Error(err, "problem running HTTP server") + os.Exit(1) + } + }() + setupLog.Info("starting manager") if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { setupLog.Error(err, "problem running manager") diff --git a/go.mod b/go.mod index c5f3936..0b333c1 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,8 @@ module github.com/NexusGPU/tensor-fusion-operator go 1.22.0 require ( + github.com/gin-contrib/gzip v1.0.1 + github.com/gin-gonic/gin v1.10.0 github.com/onsi/ginkgo/v2 v2.19.0 github.com/onsi/gomega v1.33.1 k8s.io/apimachinery v0.31.0 @@ -15,21 +17,31 @@ require ( github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect + github.com/bytedance/sonic v1.11.6 // indirect + github.com/bytedance/sonic/loader v0.1.1 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/cloudwego/base64x 
v0.1.4 // indirect + github.com/cloudwego/iasm v0.2.0 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/emicklei/go-restful/v3 v3.11.0 // indirect github.com/evanphx/json-patch/v5 v5.9.0 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect github.com/fxamacker/cbor/v2 v2.7.0 // indirect + github.com/gabriel-vasile/mimetype v1.4.3 // indirect + github.com/gin-contrib/sse v0.1.0 // indirect github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.3.0 // indirect github.com/go-openapi/jsonpointer v0.19.6 // indirect github.com/go-openapi/jsonreference v0.20.2 // indirect github.com/go-openapi/swag v0.22.4 // indirect + github.com/go-playground/locales v0.14.1 // indirect + github.com/go-playground/universal-translator v0.18.1 // indirect + github.com/go-playground/validator/v10 v10.20.0 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect + github.com/goccy/go-json v0.10.2 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/protobuf v1.5.4 // indirect @@ -44,10 +56,14 @@ require ( github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect + github.com/klauspost/cpuid/v2 v2.2.7 // indirect + github.com/leodido/go-urn v1.4.0 // indirect github.com/mailru/easyjson v0.7.7 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/pelletier/go-toml/v2 v2.2.2 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/prometheus/client_golang v1.19.1 // indirect github.com/prometheus/client_model 
v0.6.1 // indirect @@ -56,6 +72,8 @@ require ( github.com/spf13/cobra v1.8.1 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/stoewer/go-strcase v1.2.0 // indirect + github.com/twitchyliquid64/golang-asm v0.15.1 // indirect + github.com/ugorji/go/codec v1.2.12 // indirect github.com/x448/float16 v0.8.4 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 // indirect go.opentelemetry.io/otel v1.28.0 // indirect @@ -67,6 +85,8 @@ require ( go.opentelemetry.io/proto/otlp v1.3.1 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.26.0 // indirect + golang.org/x/arch v0.8.0 // indirect + golang.org/x/crypto v0.24.0 // indirect golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc // indirect golang.org/x/net v0.26.0 // indirect golang.org/x/oauth2 v0.21.0 // indirect diff --git a/go.sum b/go.sum index 0958667..96b016b 100644 --- a/go.sum +++ b/go.sum @@ -6,10 +6,18 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= +github.com/bytedance/sonic v1.11.6 h1:oUp34TzMlL+OY1OUWxHqsdkgC/Zfc85zGqw9siXjrc0= +github.com/bytedance/sonic v1.11.6/go.mod h1:LysEHSvpvDySVdC2f87zGWf6CIKJcAvqab1ZaiQtds4= +github.com/bytedance/sonic/loader v0.1.1 h1:c+e5Pt1k/cy5wMveRDyk2X4B9hF4g7an8N3zCYjJFNM= +github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU= github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 
+github.com/cloudwego/base64x v0.1.4 h1:jwCgWpFanWmN8xoIUHa2rtzmkd5J2plF/dnLS6Xd/0Y= +github.com/cloudwego/base64x v0.1.4/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w= +github.com/cloudwego/iasm v0.2.0 h1:1KNIy1I1H9hNNFEEH3DVnI4UujN+1zjpuk6gwHLTssg= +github.com/cloudwego/iasm v0.2.0/go.mod h1:8rXZaNYT2n95jn+zTI1sDr+IgcD2GVs0nlbbQPiEFhY= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -28,6 +36,14 @@ github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nos github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= +github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uqxgUFjbI0= +github.com/gabriel-vasile/mimetype v1.4.3/go.mod h1:d8uq/6HKRL6CGdk+aubisF/M5GcPfT7nKyLpA0lbSSk= +github.com/gin-contrib/gzip v1.0.1 h1:HQ8ENHODeLY7a4g1Au/46Z92bdGFl74OhxcZble9WJE= +github.com/gin-contrib/gzip v1.0.1/go.mod h1:njt428fdUNRvjuJf16tZMYZ2Yl+WQB53X5wmhDwXvC4= +github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE= +github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI= +github.com/gin-gonic/gin v1.10.0 h1:nTuyha1TYqgedzytsKYqna+DfLos46nTv2ygFy86HFU= +github.com/gin-gonic/gin v1.10.0/go.mod h1:4PMNQiOhvDRa013RKVbsiNwoyezlm2rm0uX/T7kzp5Y= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= @@ -42,8 +58,18 @@ github.com/go-openapi/jsonreference 
v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= github.com/go-openapi/swag v0.22.4 h1:QLMzNJnMGPRNDCbySlcj1x01tzU8/9LTTL9hZZZogBU= github.com/go-openapi/swag v0.22.4/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= +github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s= +github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4= +github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA= +github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY= +github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY= +github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91TpwSH2VMlDf28Uj24BCp08ZFTUY= +github.com/go-playground/validator/v10 v10.20.0 h1:K9ISHbSaI0lyB2eWMPJo+kOS/FBExVwjEviJTixqxL8= +github.com/go-playground/validator/v10 v10.20.0/go.mod h1:dbuPbCMFw/DrkbEynArYaCwl3amGuJotoKCe95atGMM= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU= +github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= @@ -76,6 +102,10 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kisielk/errcheck v1.5.0/go.mod 
h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= +github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM= +github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= +github.com/knz/go-libedit v1.10.1/go.mod h1:MZTVkCWyz0oBc7JOWP3wNAzd002ZbM/5hgShxwh4x8M= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= @@ -83,8 +113,12 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ= +github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= +github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -96,6 +130,8 @@ github.com/onsi/ginkgo/v2 
v2.19.0 h1:9Cnnf7UHo57Hy3k6/m5k3dRfGTMXGvxhHFvkDTCTpvA github.com/onsi/ginkgo/v2 v2.19.0/go.mod h1:rlwLi9PilAFJ8jCg9UE1QP6VBpd6/xj3SRC0d6TU0To= github.com/onsi/gomega v1.33.1 h1:dsYjIxxSR755MDmKVsaFQTE22ChNBcuuTWgkUDSubOk= github.com/onsi/gomega v1.33.1/go.mod h1:U4R44UsT+9eLIaYRB2a5qajjtQYn0hauxvRm16AVYg0= +github.com/pelletier/go-toml/v2 v2.2.2 h1:aYUidT7k73Pcl9nb2gScu7NSrKCSHIDE89b3+6Wq+LM= +github.com/pelletier/go-toml/v2 v2.2.2/go.mod h1:1t835xjRzz80PqgE6HHgN2JOsmgYu/h4qDAS4n929Rs= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= @@ -121,13 +157,20 @@ github.com/stoewer/go-strcase v1.2.0/go.mod h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod 
h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI= +github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08= +github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE= +github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= @@ -154,9 +197,14 @@ go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.26.0 h1:sI7k6L95XOKS281NhVKOFCUNIvv9e0w4BF8N3u+tCRo= go.uber.org/zap v1.26.0/go.mod h1:dtElttAiwGvoJ/vj4IwHBS/gXsEu/pZ50mUIRWuG0so= +golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8= +golang.org/x/arch v0.8.0 h1:3wRIsP3pM4yUptoR96otTUOXI367OS0+c9eeRi9doIc= +golang.org/x/arch v0.8.0/go.mod h1:FEVrYAQjsQXMVJ1nsMoVVXPZg6p2JE2mx8psSWTDQys= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.24.0 h1:mnl8DM0o513X8fdIkmyFE/5hTYxbwYOjDS/+rK6qpRI= +golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM= golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc h1:mCRnTeVUjcrhlRmO0VK8a6k6Rrf6TF9htwo2pJVSjIU= golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc/go.mod h1:V1LtkGg67GoY2N1AnLN78QLrzxkLyJw7RJb1gzOOz9w= golang.org/x/mod v0.2.0/go.mod 
h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= @@ -177,6 +225,8 @@ golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= @@ -239,6 +289,8 @@ k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 h1:BZqlfIlq5YbRMFko6/PM7F k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340/go.mod h1:yD4MZYeKMBwQKVht279WycxKyM84kkAx2DPrTXaeb98= k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 h1:pUdcCO1Lk/tbT5ztQWOBi5HBgbBP1J8+AsQnQCKsi8A= k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +nullprogram.com/x/optparse v1.0.0/go.mod h1:KdyPE+Igbe0jQUrVfMqDMeJQIJZEuyV7pjYmp6pbG50= +rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.30.3 h1:2770sDpzrjjsAtVhSeUFseziht227YAWYHLGNM8QPwY= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.30.3/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= sigs.k8s.io/controller-runtime v0.19.1 h1:Son+Q40+Be3QWb+niBXAg2vFiYWolDjjRfO8hn/cxOk=
diff --git a/internal/server/router/connection.go b/internal/server/router/connection.go
new file mode 100644
index 0000000..bdc3c3e
--- /dev/null
+++ b/internal/server/router/connection.go
@@ -0,0 +1,207 @@
+package router
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"sync"
+
+	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+	"github.com/gin-gonic/gin"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/watch"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// ConnectionRouter serves HTTP requests about TensorFusionConnection objects
+// on top of a shared connectionWatcher.
+type ConnectionRouter struct {
+	watcher *connectionWatcher
+}
+
+// NewConnectionRouter creates a ConnectionRouter whose watcher is started with
+// ctx and client. It returns an error when the watcher cannot be created.
+func NewConnectionRouter(ctx context.Context, client client.WithWatch) (*ConnectionRouter, error) {
+	watcher, err := newConnectionWatcher(ctx, client)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create connection watcher: %w", err)
+	}
+	return &ConnectionRouter{watcher: watcher}, nil
+}
+
+// Get handles GET requests carrying "name" and "namespace" query parameters.
+// It answers 404 when the connection cannot be fetched; otherwise it blocks
+// until the connection reaches the Running phase and answers 200 with its
+// connection URL. Nothing is written when the client goes away first.
+func (cr *ConnectionRouter) Get(ctx *gin.Context) {
+	req := types.NamespacedName{
+		Name:      ctx.Query("name"),
+		Namespace: ctx.Query("namespace"),
+	}
+
+	// Subscribe before the initial read so that an update arriving between
+	// the read and the subscription is not lost.
+	ch, cancelFunc := cr.watcher.subscribe(req)
+	defer cancelFunc()
+
+	// The request context is cancelled when the client disconnects, which
+	// releases both the lookup and the wait loop below.
+	reqCtx := ctx.Request.Context()
+
+	conn := cr.watcher.get(reqCtx, req)
+	if conn == nil {
+		ctx.JSON(http.StatusNotFound, gin.H{"error": "connection not found"})
+		return
+	}
+
+	for {
+		// Also covers a connection that is already Running at lookup time.
+		if conn.Status.Phase == tfv1.TensorFusionConnectionRunning {
+			ctx.JSON(http.StatusOK, conn.Status.ConnectionURL)
+			return
+		}
+
+		select {
+		case <-reqCtx.Done():
+			return
+		case next, ok := <-ch:
+			if !ok {
+				return
+			}
+			conn = next
+		}
+	}
+}
+
+// connectionChannel delivers TensorFusionConnection updates to one subscriber.
+type connectionChannel chan *tfv1.TensorFusionConnection
+
+// connectionSet is the set of subscriber channels of a single connection.
+type connectionSet map[connectionChannel]struct{}
+
+// connectionSubscribers maps a connection's namespaced name to its subscribers.
+type connectionSubscribers map[types.NamespacedName]connectionSet
+
+// connectionWatcher fans watch events for TensorFusionConnection objects out
+// to the subscribers registered for each object.
+type connectionWatcher struct {
+	client client.WithWatch
+
+	mu   sync.RWMutex // guards subs
+	subs connectionSubscribers
+}
+
+// newConnectionWatcher opens a watch on TensorFusionConnection objects and
+// starts the goroutine that dispatches its events to subscribers.
+func newConnectionWatcher(ctx context.Context, client client.WithWatch) (*connectionWatcher, error) {
+	cw := &connectionWatcher{
+		client: client,
+		subs:   make(connectionSubscribers),
+	}
+	watcher, err := cw.client.Watch(ctx, &tfv1.TensorFusionConnectionList{})
+	if err != nil {
+		return nil, fmt.Errorf("failed to watch connections: %w", err)
+	}
+	go cw.watchConnections(ctx, watcher)
+	return cw, nil
+}
+
+// get fetches the connection named by req. It returns nil on any lookup
+// error, not only when the object does not exist.
+func (cw *connectionWatcher) get(ctx context.Context, req types.NamespacedName) *tfv1.TensorFusionConnection {
+	conn := &tfv1.TensorFusionConnection{}
+	if err := cw.client.Get(ctx, req, conn); err != nil {
+		return nil
+	}
+	return conn
+}
+
+// subscribe registers a subscriber for updates of the connection named by req.
+// It returns the channel the updates are delivered on and a cancel function
+// that removes the subscription and closes the channel. The cancel function
+// may be called more than once.
+func (cw *connectionWatcher) subscribe(req types.NamespacedName) (connectionChannel, func()) {
+	ch := make(connectionChannel, 1)
+
+	cw.mu.Lock()
+	if _, exists := cw.subs[req]; !exists {
+		cw.subs[req] = make(connectionSet)
+	}
+	cw.subs[req][ch] = struct{}{}
+	cw.mu.Unlock()
+
+	cancelFunc := func() {
+		cw.mu.Lock()
+		defer cw.mu.Unlock()
+
+		chans := cw.subs[req]
+		if _, subscribed := chans[ch]; !subscribed {
+			// Already cancelled; closing ch a second time would panic.
+			return
+		}
+		delete(chans, ch)
+		close(ch)
+
+		// If no more subscribers, remove the key
+		if len(chans) == 0 {
+			delete(cw.subs, req)
+		}
+	}
+
+	return ch, cancelFunc
+}
+
+// watchConnections forwards watch events to subscribers until ctx is done or
+// the watch's result channel is closed. The watch is stopped on return.
+//
+// NOTE(review): once the result channel closes, the watch is not
+// re-established here, so subscribers stop receiving updates.
+func (cw *connectionWatcher) watchConnections(ctx context.Context, watcher watch.Interface) {
+	defer watcher.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case event, ok := <-watcher.ResultChan():
+			if !ok {
+				return
+			}
+
+			conn, ok := event.Object.(*tfv1.TensorFusionConnection)
+			if !ok {
+				continue
+			}
+			cw.publish(conn)
+		}
+	}
+}
+
+// publish delivers conn to every subscriber of that connection without
+// blocking. A subscriber that has not consumed its previous update gets it
+// replaced, so the newest state is the one left queued.
+func (cw *connectionWatcher) publish(conn *tfv1.TensorFusionConnection) {
+	key := types.NamespacedName{Name: conn.Name, Namespace: conn.Namespace}
+
+	// The read lock is held across the sends: cancel functions close channels
+	// under the write lock, so no send can hit a closed channel.
+	cw.mu.RLock()
+	defer cw.mu.RUnlock()
+
+	for ch := range cw.subs[key] {
+		select {
+		case ch <- conn:
+			continue
+		default:
+		}
+
+		// Buffer full: drop the stale update, then send the new one.
+		select {
+		case <-ch:
+		default:
+		}
+		select {
+		case ch <- conn:
+		default:
+		}
+	}
+}
diff --git a/internal/server/server.go b/internal/server/server.go
new file mode 100644
index 0000000..fa2995c
--- /dev/null
+++ b/internal/server/server.go
@@ -0,0 +1,21 @@
+package server
+
+import (
+	"github.com/NexusGPU/tensor-fusion-operator/internal/server/router"
+	"github.com/gin-contrib/gzip"
+	"github.com/gin-gonic/gin"
+)
+
+// NewHTTPServer returns a gin engine with gzip and recovery middleware that
+// serves GET /api/connection through cr.
+func NewHTTPServer(
+	cr *router.ConnectionRouter,
+) *gin.Engine {
+	r := gin.New()
+
r.Use(gzip.Gzip(gzip.DefaultCompression)) + r.Use(gin.Recovery()) + + apiGroup := r.Group("/api") + apiGroup.GET("/connection", cr.Get) + return r +} From 022b2713ac0ed490edb9c647d6d8462528473cb0 Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Tue, 3 Dec 2024 11:00:45 +0000 Subject: [PATCH 06/22] fix typo --- internal/controller/gpunode_controller.go | 2 +- test/e2e/e2e_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/controller/gpunode_controller.go b/internal/controller/gpunode_controller.go index 265f840..bab8ff5 100644 --- a/internal/controller/gpunode_controller.go +++ b/internal/controller/gpunode_controller.go @@ -43,7 +43,7 @@ type GPUNodeReconciler struct { // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - // TOOD: Calculate tflops and update capacity here + // TODO: Calculate tflops and update capacity here return ctrl.Result{}, nil } diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index a218480..aeac7d0 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -278,7 +278,7 @@ func serviceAccountToken() (string, error) { // Parse the JSON output to extract the token var token tokenRequest - err = json.Unmarshal([]byte(output), &token) + err = json.Unmarshal(output, &token) g.Expect(err).NotTo(HaveOccurred()) out = token.Status.Token From b5a3b0a7e154fc5941d3ff16be936e12e474d5cb Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Wed, 4 Dec 2024 08:05:26 +0000 Subject: [PATCH 07/22] chore: simplify error messages --- internal/server/router/connection.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/server/router/connection.go b/internal/server/router/connection.go index bdc3c3e..1abad0e 100644 --- a/internal/server/router/connection.go +++ 
b/internal/server/router/connection.go @@ -19,7 +19,7 @@ type ConnectionRouter struct { func NewConnectionRouter(ctx context.Context, client client.WithWatch) (*ConnectionRouter, error) { watcher, err := newConnectionWatcher(ctx, client) if err != nil { - return nil, fmt.Errorf("failed to create connection watcher: %w", err) + return nil, fmt.Errorf("create connection watcher: %w", err) } return &ConnectionRouter{watcher: watcher}, nil } @@ -66,7 +66,7 @@ func newConnectionWatcher(ctx context.Context, client client.WithWatch) (*connec } watcher, err := cw.client.Watch(ctx, &tfv1.TensorFusionConnectionList{}) if err != nil { - return nil, fmt.Errorf("failed to watch connections: %w", err) + return nil, fmt.Errorf("watch connections: %w", err) } go cw.watchConnections(ctx, watcher) return cw, nil From 289d7c6a12b1e818b09694d3246dfc5c9dba1d0f Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Thu, 5 Dec 2024 07:21:47 +0000 Subject: [PATCH 08/22] feat: add pod mutation webhook --- PROJECT | 8 + cmd/main.go | 11 + config/certmanager/certificate.yaml | 35 +++ config/certmanager/kustomization.yaml | 5 + config/certmanager/kustomizeconfig.yaml | 8 + config/crd/kustomization.yaml | 4 +- config/crd/patches/cainjection_in_pods.yaml | 7 + config/crd/patches/webhook_in_pods.yaml | 16 ++ config/default/kustomization.yaml | 4 +- config/default/manager_webhook_patch.yaml | 26 ++ .../network-policy/allow-webhook-traffic.yaml | 26 ++ config/network-policy/kustomization.yaml | 1 + config/webhook/kustomization.yaml | 6 + config/webhook/kustomizeconfig.yaml | 22 ++ config/webhook/service.yaml | 15 ++ go.mod | 2 +- internal/config/config.go | 16 ++ internal/webhook/v1/pod_webhook.go | 226 ++++++++++++++++++ internal/webhook/v1/pod_webhook_test.go | 55 +++++ test/e2e/e2e_test.go | 10 + 20 files changed, 498 insertions(+), 5 deletions(-) create mode 100644 config/certmanager/certificate.yaml create mode 100644 config/certmanager/kustomization.yaml create mode 100644 
config/certmanager/kustomizeconfig.yaml create mode 100644 config/crd/patches/cainjection_in_pods.yaml create mode 100644 config/crd/patches/webhook_in_pods.yaml create mode 100644 config/default/manager_webhook_patch.yaml create mode 100644 config/network-policy/allow-webhook-traffic.yaml create mode 100644 config/webhook/kustomization.yaml create mode 100644 config/webhook/kustomizeconfig.yaml create mode 100644 config/webhook/service.yaml create mode 100644 internal/config/config.go create mode 100644 internal/webhook/v1/pod_webhook.go create mode 100644 internal/webhook/v1/pod_webhook_test.go diff --git a/PROJECT b/PROJECT index 80dae4a..cde7c4f 100644 --- a/PROJECT +++ b/PROJECT @@ -26,4 +26,12 @@ resources: kind: GPUNode path: github.com/NexusGPU/tensor-fusion-operator/api/v1 version: v1 +- core: true + group: core + kind: Pod + path: k8s.io/api/core/v1 + version: v1 + webhooks: + conversion: true + webhookVersion: v1 version: "3" diff --git a/cmd/main.go b/cmd/main.go index d2d6e85..7f7bf2b 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -38,9 +38,11 @@ import ( "sigs.k8s.io/controller-runtime/pkg/webhook" tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" + "github.com/NexusGPU/tensor-fusion-operator/internal/config" "github.com/NexusGPU/tensor-fusion-operator/internal/controller" "github.com/NexusGPU/tensor-fusion-operator/internal/server" "github.com/NexusGPU/tensor-fusion-operator/internal/server/router" + webhookcorev1 "github.com/NexusGPU/tensor-fusion-operator/internal/webhook/v1" // +kubebuilder:scaffold:imports ) @@ -148,6 +150,7 @@ func main() { } ctx := context.Background() + config := config.NewDefaultConfig() if err = (&controller.TensorFusionConnectionReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), @@ -162,6 +165,14 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "GPUNode") os.Exit(1) } + + // nolint:goconst + if os.Getenv("ENABLE_WEBHOOKS") != "false" { + if err = 
webhookcorev1.SetupPodWebhookWithManager(mgr, &config.PodMutator); err != nil { + setupLog.Error(err, "unable to create webhook", "webhook", "Pod") + os.Exit(1) + } + } // +kubebuilder:scaffold:builder if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { diff --git a/config/certmanager/certificate.yaml b/config/certmanager/certificate.yaml new file mode 100644 index 0000000..7fc10a8 --- /dev/null +++ b/config/certmanager/certificate.yaml @@ -0,0 +1,35 @@ +# The following manifests contain a self-signed issuer CR and a certificate CR. +# More document can be found at https://docs.cert-manager.io +# WARNING: Targets CertManager v1.0. Check https://cert-manager.io/docs/installation/upgrading/ for breaking changes. +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + labels: + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize + name: selfsigned-issuer + namespace: system +spec: + selfSigned: {} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + labels: + app.kubernetes.io/name: certificate + app.kubernetes.io/instance: serving-cert + app.kubernetes.io/component: certificate + app.kubernetes.io/created-by: tensor-fusion-operator + app.kubernetes.io/part-of: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize + name: serving-cert # this name should match the one appeared in kustomizeconfig.yaml + namespace: system +spec: + # SERVICE_NAME and SERVICE_NAMESPACE will be substituted by kustomize + dnsNames: + - SERVICE_NAME.SERVICE_NAMESPACE.svc + - SERVICE_NAME.SERVICE_NAMESPACE.svc.cluster.local + issuerRef: + kind: Issuer + name: selfsigned-issuer + secretName: webhook-server-cert # this secret will not be prefixed, since it's not managed by kustomize diff --git a/config/certmanager/kustomization.yaml b/config/certmanager/kustomization.yaml new file mode 100644 index 0000000..bebea5a --- /dev/null +++ b/config/certmanager/kustomization.yaml @@ -0,0 +1,5 @@ +resources: +- 
certificate.yaml + +configurations: +- kustomizeconfig.yaml diff --git a/config/certmanager/kustomizeconfig.yaml b/config/certmanager/kustomizeconfig.yaml new file mode 100644 index 0000000..cf6f89e --- /dev/null +++ b/config/certmanager/kustomizeconfig.yaml @@ -0,0 +1,8 @@ +# This configuration is for teaching kustomize how to update name ref substitution +nameReference: +- kind: Issuer + group: cert-manager.io + fieldSpecs: + - kind: Certificate + group: cert-manager.io + path: spec/issuerRef/name diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml index 127a6ba..86141ab 100644 --- a/config/crd/kustomization.yaml +++ b/config/crd/kustomization.yaml @@ -17,5 +17,5 @@ patches: # [WEBHOOK] To enable webhook, uncomment the following section # the following config is for teaching kustomize how to do kustomization for CRDs. -#configurations: -#- kustomizeconfig.yaml +configurations: +- kustomizeconfig.yaml diff --git a/config/crd/patches/cainjection_in_pods.yaml b/config/crd/patches/cainjection_in_pods.yaml new file mode 100644 index 0000000..b1ab830 --- /dev/null +++ b/config/crd/patches/cainjection_in_pods.yaml @@ -0,0 +1,7 @@ +# The following patch adds a directive for certmanager to inject CA into the CRD +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + cert-manager.io/inject-ca-from: CERTIFICATE_NAMESPACE/CERTIFICATE_NAME + name: pods.core diff --git a/config/crd/patches/webhook_in_pods.yaml b/config/crd/patches/webhook_in_pods.yaml new file mode 100644 index 0000000..8fa5d25 --- /dev/null +++ b/config/crd/patches/webhook_in_pods.yaml @@ -0,0 +1,16 @@ +# The following patch enables a conversion webhook for the CRD +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: pods.core +spec: + conversion: + strategy: Webhook + webhook: + clientConfig: + service: + namespace: system + name: webhook-service + path: /convert + conversionReviewVersions: + - v1 diff --git 
a/config/default/kustomization.yaml b/config/default/kustomization.yaml index 4f303ca..abb7ff0 100644 --- a/config/default/kustomization.yaml +++ b/config/default/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ../manager # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in # crd/kustomization.yaml -#- ../webhook +- ../webhook # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required. #- ../certmanager # [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'. @@ -43,7 +43,7 @@ patches: # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in # crd/kustomization.yaml -#- path: manager_webhook_patch.yaml +- path: manager_webhook_patch.yaml # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER' prefix. # Uncomment the following replacements to add the cert-manager CA injection annotations diff --git a/config/default/manager_webhook_patch.yaml b/config/default/manager_webhook_patch.yaml new file mode 100644 index 0000000..ad299a5 --- /dev/null +++ b/config/default/manager_webhook_patch.yaml @@ -0,0 +1,26 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: controller-manager + namespace: system + labels: + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize +spec: + template: + spec: + containers: + - name: manager + ports: + - containerPort: 9443 + name: webhook-server + protocol: TCP + volumeMounts: + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: cert + readOnly: true + volumes: + - name: cert + secret: + defaultMode: 420 + secretName: webhook-server-cert diff --git a/config/network-policy/allow-webhook-traffic.yaml b/config/network-policy/allow-webhook-traffic.yaml new file mode 100644 index 0000000..9076e88 --- /dev/null +++ b/config/network-policy/allow-webhook-traffic.yaml @@ -0,0 +1,26 @@ +# This 
NetworkPolicy allows ingress traffic to your webhook server running +# as part of the controller-manager from specific namespaces and pods. CR(s) which uses webhooks +# will only work when applied in namespaces labeled with 'webhook: enabled' +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + labels: + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize + name: allow-webhook-traffic + namespace: system +spec: + podSelector: + matchLabels: + control-plane: controller-manager + policyTypes: + - Ingress + ingress: + # This allows ingress traffic from any namespace with the label webhook: enabled + - from: + - namespaceSelector: + matchLabels: + webhook: enabled # Only from namespaces with this label + ports: + - port: 443 + protocol: TCP diff --git a/config/network-policy/kustomization.yaml b/config/network-policy/kustomization.yaml index ec0fb5e..0872bee 100644 --- a/config/network-policy/kustomization.yaml +++ b/config/network-policy/kustomization.yaml @@ -1,2 +1,3 @@ resources: +- allow-webhook-traffic.yaml - allow-metrics-traffic.yaml diff --git a/config/webhook/kustomization.yaml b/config/webhook/kustomization.yaml new file mode 100644 index 0000000..9cf2613 --- /dev/null +++ b/config/webhook/kustomization.yaml @@ -0,0 +1,6 @@ +resources: +- manifests.yaml +- service.yaml + +configurations: +- kustomizeconfig.yaml diff --git a/config/webhook/kustomizeconfig.yaml b/config/webhook/kustomizeconfig.yaml new file mode 100644 index 0000000..206316e --- /dev/null +++ b/config/webhook/kustomizeconfig.yaml @@ -0,0 +1,22 @@ +# the following config is for teaching kustomize where to look at when substituting nameReference. +# It requires kustomize v2.1.0 or newer to work properly. 
+nameReference: +- kind: Service + version: v1 + fieldSpecs: + - kind: MutatingWebhookConfiguration + group: admissionregistration.k8s.io + path: webhooks/clientConfig/service/name + - kind: ValidatingWebhookConfiguration + group: admissionregistration.k8s.io + path: webhooks/clientConfig/service/name + +namespace: +- kind: MutatingWebhookConfiguration + group: admissionregistration.k8s.io + path: webhooks/clientConfig/service/namespace + create: true +- kind: ValidatingWebhookConfiguration + group: admissionregistration.k8s.io + path: webhooks/clientConfig/service/namespace + create: true diff --git a/config/webhook/service.yaml b/config/webhook/service.yaml new file mode 100644 index 0000000..409f372 --- /dev/null +++ b/config/webhook/service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/name: tensor-fusion-operator + app.kubernetes.io/managed-by: kustomize + name: webhook-service + namespace: system +spec: + ports: + - port: 443 + protocol: TCP + targetPort: 9443 + selector: + control-plane: controller-manager diff --git a/go.mod b/go.mod index 0b333c1..deab21d 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/gin-gonic/gin v1.10.0 github.com/onsi/ginkgo/v2 v2.19.0 github.com/onsi/gomega v1.33.1 + k8s.io/api v0.31.0 k8s.io/apimachinery v0.31.0 k8s.io/client-go v0.31.0 sigs.k8s.io/controller-runtime v0.19.1 @@ -104,7 +105,6 @@ require ( gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/api v0.31.0 // indirect k8s.io/apiextensions-apiserver v0.31.0 // indirect k8s.io/apiserver v0.31.0 // indirect k8s.io/component-base v0.31.0 // indirect diff --git a/internal/config/config.go b/internal/config/config.go new file mode 100644 index 0000000..1146c2b --- /dev/null +++ b/internal/config/config.go @@ -0,0 +1,16 @@ +package config + +import corev1 "k8s.io/api/core/v1" + +type Config struct { + PodMutator PodMutator `json:"podMutator"` +} + 
+type PodMutator struct { + PatchStrategicMerge corev1.Pod `json:"patchStrategicMerge"` + PatchEnvVars []corev1.EnvVar `json:"envVars"` +} + +func NewDefaultConfig() Config { + return Config{} +} diff --git a/internal/webhook/v1/pod_webhook.go b/internal/webhook/v1/pod_webhook.go new file mode 100644 index 0000000..3140b53 --- /dev/null +++ b/internal/webhook/v1/pod_webhook.go @@ -0,0 +1,226 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1 + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + + tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" + "github.com/NexusGPU/tensor-fusion-operator/internal/config" + "gomodules.xyz/jsonpatch/v2" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/strategicpatch" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/webhook/admission" +) + +// SetupPodWebhookWithManager registers the webhook for Pod in the manager. 
+func SetupPodWebhookWithManager(mgr ctrl.Manager, config *config.PodMutator) error { + webhookServer := mgr.GetWebhookServer() + webhookServer.Register("/mutate-v1-pod", + &admission.Webhook{ + Handler: &TensorFusionPodMutator{ + Config: config, + Client: mgr.GetClient(), + }, + }) + return nil +} + +type TensorFusionPodMutator struct { + Client client.Client + Config *config.PodMutator + decoder admission.Decoder +} + +// Handle implements admission.Handler interface. +func (m *TensorFusionPodMutator) Handle(ctx context.Context, req admission.Request) admission.Response { + pod := &corev1.Pod{} + if err := m.decoder.Decode(req, pod); err != nil { + return admission.Errored(http.StatusBadRequest, err) + } + + log := log.FromContext(ctx) + log.Info("Mutating pod", "name", pod.Name, "namespace", pod.Namespace) + + reqs := parseTFReq(pod) + // 1. Inject initContainer and env variables + patches, err := m.patchTFClient(pod, reqs) + if err != nil { + return admission.Errored(http.StatusInternalServerError, err) + } + + // generate tensor fusion connections and apply to cluster + tfConnections := generateTensorFusionConnection(pod, reqs) + + for _, tfConnection := range tfConnections { + if err := m.Client.Create(ctx, tfConnection); err != nil { + log.Error(err, "Failed to create TensorFusionConnection") + return admission.Errored(http.StatusInternalServerError, err) + } + } + + return admission.Patched("tensor fusion component patched", patches...) +} + +// InjectDecoder injects the decoder. 
+func (m *TensorFusionPodMutator) InjectDecoder(d admission.Decoder) error { + m.decoder = d + return nil +} + +type TFReq struct { + ContainerName string + Tflops resource.Quantity + Vram resource.Quantity +} + +func parseTFReq(pod *corev1.Pod) []TFReq { + if pod.Annotations == nil { + return nil + } + + reqs := make([]TFReq, 0, len(pod.Spec.Containers)) + + for _, container := range pod.Spec.Containers { + containerName := container.Name + + // Check if tensor fusion is enabled for this container + enableKey := fmt.Sprintf("tensor-fusion.ai/enable-%s", containerName) + if enableStr, ok := pod.Annotations[enableKey]; !ok || enableStr != "true" { + continue + } + + req := TFReq{ + ContainerName: containerName, + } + + // Parse TFLOPS requirement + tflopsKey := fmt.Sprintf("tensor-fusion.ai/tflops-%s", containerName) + if tflopsStr, ok := pod.Annotations[tflopsKey]; ok { + tflops, err := resource.ParseQuantity(tflopsStr) + if err == nil { + req.Tflops = tflops + } + } + + // Parse VRAM requirement + vramKey := fmt.Sprintf("tensor-fusion.ai/vram-%s", containerName) + if vramStr, ok := pod.Annotations[vramKey]; ok { + vram, err := resource.ParseQuantity(vramStr) + if err == nil { + req.Vram = vram + } + } + + reqs = append(reqs, req) + } + + return reqs +} + +func (m *TensorFusionPodMutator) patchTFClient(pod *corev1.Pod, tfReq []TFReq) ([]jsonpatch.JsonPatchOperation, error) { + podPatch := m.Config.PatchStrategicMerge + // Copy containers + podPatch.Spec.Containers = append([]corev1.Container{}, podPatch.Spec.Containers...) + + // Patch env vars + for _, req := range tfReq { + for _, container := range podPatch.Spec.Containers { + if container.Name == req.ContainerName { + container.Env = append(container.Env, m.Config.PatchEnvVars...) 
+ } + } + } + + // Convert the strategic merge patch to JSON + patchBytes, err := json.Marshal(m.Config.PatchStrategicMerge) + if err != nil { + return nil, fmt.Errorf("marshal patch: %v", err) + } + + // Convert the current pod to JSON + currentBytes, err := json.Marshal(pod) + if err != nil { + return nil, fmt.Errorf("marshal current pod: %v", err) + } + + // Apply the strategic merge patch + resultBytes, err := strategicpatch.StrategicMergePatch(currentBytes, patchBytes, corev1.Pod{}) + if err != nil { + return nil, fmt.Errorf("apply strategic merge patch: %v", err) + } + + // Generate JSON patch operations by comparing original and patched pod + patches, err := jsonpatch.CreatePatch(currentBytes, resultBytes) + if err != nil { + return nil, fmt.Errorf("create json patch: %v", err) + } + + // Unmarshal the result back into the pod + if err := json.Unmarshal(resultBytes, pod); err != nil { + return nil, fmt.Errorf("unmarshal patched pod: %v", err) + } + + return patches, nil +} + +func generateTensorFusionConnection(pod *corev1.Pod, tfReq []TFReq) []*tfv1.TensorFusionConnection { + connections := make([]*tfv1.TensorFusionConnection, 0, len(tfReq)) + + for _, req := range tfReq { + connection := &tfv1.TensorFusionConnection{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("%s-tf-%s", pod.Name, req.ContainerName), + Namespace: pod.Namespace, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "v1", + Kind: "Pod", + Name: pod.Name, + UID: pod.UID, + }, + }, + }, + Spec: tfv1.TensorFusionConnectionSpec{ + Resources: tfv1.Resources{ + Request: tfv1.Resource{ + Tflops: req.Tflops, + Vram: req.Vram, + }, + Limit: tfv1.Resource{ + Tflops: req.Tflops, + Vram: req.Vram, + }, + }, + }, + Status: tfv1.TensorFusionConnectionStatus{ + Phase: tfv1.TensorFusionConnectionPending, + }, + } + connections = append(connections, connection) + } + + return connections +} diff --git a/internal/webhook/v1/pod_webhook_test.go b/internal/webhook/v1/pod_webhook_test.go new 
file mode 100644 index 0000000..b83a9f1 --- /dev/null +++ b/internal/webhook/v1/pod_webhook_test.go @@ -0,0 +1,55 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1 + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + // TODO (user): Add any additional imports if needed +) + +var _ = Describe("Pod Webhook", func() { + var ( + obj *corev1.Pod + oldObj *corev1.Pod + ) + + BeforeEach(func() { + obj = &corev1.Pod{} + oldObj = &corev1.Pod{} + Expect(oldObj).NotTo(BeNil(), "Expected oldObj to be initialized") + Expect(obj).NotTo(BeNil(), "Expected obj to be initialized") + // TODO (user): Add any setup logic common to all tests + }) + + AfterEach(func() { + // TODO (user): Add any teardown logic common to all tests + }) + + Context("When creating Pod under Conversion Webhook", func() { + // TODO (user): Add logic to convert the object to the desired version and verify the conversion + // Example: + // It("Should convert the object correctly", func() { + // convertedObj := &corev1.Pod{} + // Expect(obj.ConvertTo(convertedObj)).To(Succeed()) + // Expect(convertedObj).ToNot(BeNil()) + // }) + }) + +}) diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index aeac7d0..67c63dc 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -234,6 +234,16 @@ var _ = Describe("Manager", Ordered, func() { )) }) + It("should provisioned cert-manager", func() { + By("validating that cert-manager has the 
certificate Secret") + verifyCertManager := func(g Gomega) { + cmd := exec.Command("kubectl", "get", "secrets", "webhook-server-cert", "-n", namespace) + _, err := utils.Run(cmd) + g.Expect(err).NotTo(HaveOccurred()) + } + Eventually(verifyCertManager).Should(Succeed()) + }) + // +kubebuilder:scaffold:e2e-webhooks-checks // TODO: Customize the e2e test suite with scenarios specific to your project. From 4a2ec8bf3d862f567754827c2de043ccd6420f57 Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Thu, 5 Dec 2024 07:41:27 +0000 Subject: [PATCH 09/22] feat: implement resource cleanup for TensorFusionConnection - Add finalizer handling in TensorFusionConnection controller - Implement Release method in NaiveScheduler for resource cleanup This change ensures proper cleanup of GPU resources when a TensorFusionConnection is deleted, preventing resource leaks. --- api/v1/tensorfusionconnection_types.go | 1 + internal/constants/constants.go | 15 ++++ .../tensorfusionconnection_controller.go | 79 ++++++++++++++++++ internal/scheduler/naive.go | 15 ++++ internal/scheduler/naive_test.go | 80 +++++++++++++++++++ internal/scheduler/scheduler.go | 3 + internal/webhook/v1/pod_webhook.go | 8 +- 7 files changed, 198 insertions(+), 3 deletions(-) create mode 100644 internal/constants/constants.go diff --git a/api/v1/tensorfusionconnection_types.go b/api/v1/tensorfusionconnection_types.go index 955f227..00af5f2 100644 --- a/api/v1/tensorfusionconnection_types.go +++ b/api/v1/tensorfusionconnection_types.go @@ -49,6 +49,7 @@ type TensorFusionConnectionStatus struct { Phase TensorFusionConnectionPhase `json:"phase"` ConnectionURL string `json:"connectionURL"` QosClass string `json:"qosClass"` + Node string `json:"node,omitempty"` } // +kubebuilder:object:root=true diff --git a/internal/constants/constants.go b/internal/constants/constants.go new file mode 100644 index 0000000..acf7eee --- /dev/null +++ b/internal/constants/constants.go @@ -0,0 +1,15 @@ +package constants + 
+const ( + // TensorFusionDomain is the domain prefix used for all tensor-fusion.ai related annotations and finalizers + TensorFusionDomain = "tensor-fusion.ai" + + // Finalizer constants + TensorFusionFinalizerSuffix = "finalizer" + TensorFusionFinalizer = TensorFusionDomain + "/" + TensorFusionFinalizerSuffix + + // Annotation key constants + EnableContainerAnnotationFormat = TensorFusionDomain + "/enable-%s" + TFLOPSContainerAnnotationFormat = TensorFusionDomain + "/tflops-%s" + VRAMContainerAnnotationFormat = TensorFusionDomain + "/vram-%s" +) diff --git a/internal/controller/tensorfusionconnection_controller.go b/internal/controller/tensorfusionconnection_controller.go index b9223f4..8e14687 100644 --- a/internal/controller/tensorfusionconnection_controller.go +++ b/internal/controller/tensorfusionconnection_controller.go @@ -27,6 +27,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" + "github.com/NexusGPU/tensor-fusion-operator/internal/constants" scheduler "github.com/NexusGPU/tensor-fusion-operator/internal/scheduler" "github.com/NexusGPU/tensor-fusion-operator/internal/worker" ) @@ -38,6 +39,10 @@ type TensorFusionConnectionReconciler struct { Scheduler scheduler.Scheduler } +var ( + tensorFusionConnectionFinalizer = constants.TensorFusionFinalizer +) + // +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=tensorfusionconnections,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=tensorfusionconnections/status,verbs=get;update;patch // +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=tensorfusionconnections/finalizers,verbs=update @@ -58,6 +63,35 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct return ctrl.Result{}, err } + // Check if the connection is being deleted + if !connection.DeletionTimestamp.IsZero() { + // The object is being deleted + if 
containsString(connection.Finalizers, tensorFusionConnectionFinalizer) { + // Our finalizer is present, so let's handle our external dependency + if err := r.handleDeletion(ctx, connection); err != nil { + return ctrl.Result{}, err + } + + // Remove our finalizer from the list and update it + connection.Finalizers = removeString(connection.Finalizers, tensorFusionConnectionFinalizer) + if err := r.Update(ctx, connection); err != nil { + return ctrl.Result{}, err + } + } + // Our finalizer has finished, so the reconciler can do nothing + return ctrl.Result{}, nil + } + + // Add finalizer if it's not present + if !containsString(connection.Finalizers, tensorFusionConnectionFinalizer) { + connection.Finalizers = append(connection.Finalizers, tensorFusionConnectionFinalizer) + if err := r.Update(ctx, connection); err != nil { + return ctrl.Result{}, err + } + // Return here as the update will trigger another reconciliation + return ctrl.Result{}, nil + } + var node *tfv1.GPUNode // If status is not set or pending, try to schedule if connection.Status.Phase == "" || connection.Status.Phase == tfv1.TensorFusionConnectionPending { @@ -69,6 +103,7 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct } else if node != nil { connection.Status.Phase = tfv1.TensorFusionConnectionRunning connection.Status.ConnectionURL = worker.GenerateConnectionURL(node, connection) + connection.Status.Node = node.Name // Store the node name for cleanup } else { connection.Status.Phase = tfv1.TensorFusionConnectionPending } @@ -81,6 +116,50 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct return ctrl.Result{}, nil } +// handleDeletion handles cleanup of external dependencies +func (r *TensorFusionConnectionReconciler) handleDeletion(ctx context.Context, connection *tfv1.TensorFusionConnection) error { + if connection.Status.Node == "" { + return nil // No node was allocated, nothing to clean up + } + + // Get the node + node := 
&tfv1.GPUNode{} + if err := r.Get(ctx, client.ObjectKey{Name: connection.Status.Node}, node); err != nil { + if errors.IsNotFound(err) { + // Node is already gone, nothing to do + return nil + } + return err + } + + // Release the resources + if err := r.Scheduler.Release(node); err != nil { + return err + } + + return nil +} + +// Helper functions to handle finalizers +func containsString(slice []string, s string) bool { + for _, item := range slice { + if item == s { + return true + } + } + return false +} + +func removeString(slice []string, s string) []string { + result := []string{} + for _, item := range slice { + if item != s { + result = append(result, item) + } + } + return result +} + func (r *TensorFusionConnectionReconciler) MustUpdateStatus(ctx context.Context, connection *tfv1.TensorFusionConnection, gpuNode *tfv1.GPUNode) error { return retry.RetryOnConflict(retry.DefaultBackoff, func() error { // Get the latest version of the connection diff --git a/internal/scheduler/naive.go b/internal/scheduler/naive.go index 8d9a4d3..0192173 100644 --- a/internal/scheduler/naive.go +++ b/internal/scheduler/naive.go @@ -58,3 +58,18 @@ func (s *NaiveScheduler) OnDelete(node *tfv1.GPUNode) { defer s.Unlock() delete(s.nodes, node.Name) } + +// Release implements Scheduler interface +func (s *NaiveScheduler) Release(node *tfv1.GPUNode) error { + s.Lock() + defer s.Unlock() + + existingNode, ok := s.nodes[node.Name] + if !ok { + return fmt.Errorf("node %s not found", node.Name) + } + + // Reset the node's available resources to its capacity + existingNode.Status.Available = existingNode.Status.Capacity + return nil +} diff --git a/internal/scheduler/naive_test.go b/internal/scheduler/naive_test.go index be1affd..f6367a2 100644 --- a/internal/scheduler/naive_test.go +++ b/internal/scheduler/naive_test.go @@ -164,3 +164,83 @@ func TestNaiveScheduler_NodeOperations(t *testing.T) { t.Error("After OnDelete: Schedule() should fail with no nodes") } } + +func 
TestNaiveScheduler_Release(t *testing.T) { + tests := []struct { + name string + node *tfv1.GPUNode + schedule *tfv1.Resource + wantError bool + }{ + { + name: "release non-existent node", + node: createGPUNode("node1", "100", "16Gi"), + wantError: true, + }, + { + name: "release after scheduling", + node: &tfv1.GPUNode{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + }, + Status: tfv1.GPUNodeStatus{ + Capacity: tfv1.Resource{ + Tflops: resource.MustParse("100"), + Vram: resource.MustParse("16Gi"), + }, + Available: tfv1.Resource{ + Tflops: resource.MustParse("100"), + Vram: resource.MustParse("16Gi"), + }, + }, + }, + schedule: &tfv1.Resource{ + Tflops: resource.MustParse("50"), + Vram: resource.MustParse("8Gi"), + }, + wantError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + s := NewNaiveScheduler() + + if !tt.wantError { + // Add the node first + s.OnAdd(tt.node) + + // Schedule some resources if needed + if tt.schedule != nil { + node, err := s.Schedule(*tt.schedule) + if err != nil { + t.Errorf("Schedule() error = %v", err) + return + } + + // Verify resources were allocated + if node.Status.Available.Tflops.Cmp(resource.MustParse("50")) != 0 || + node.Status.Available.Vram.Cmp(resource.MustParse("8Gi")) != 0 { + t.Errorf("Schedule() did not allocate resources correctly") + return + } + } + } + + err := s.Release(tt.node) + if (err != nil) != tt.wantError { + t.Errorf("Release() error = %v, wantError %v", err, tt.wantError) + return + } + + if !tt.wantError { + // Verify resources were restored + node := s.nodes[tt.node.Name] + if node.Status.Available.Tflops.Cmp(node.Status.Capacity.Tflops) != 0 || + node.Status.Available.Vram.Cmp(node.Status.Capacity.Vram) != 0 { + t.Errorf("Release() did not restore resources correctly") + } + } + }) + } +} diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go index 47ac776..163124c 100644 --- a/internal/scheduler/scheduler.go +++ 
b/internal/scheduler/scheduler.go @@ -11,6 +11,9 @@ type Scheduler interface { // an nil pointer and an error. Schedule(request tfv1.Resource) (*tfv1.GPUNode, error) + // Release frees the allocated resources of a node + Release(node *tfv1.GPUNode) error + // OnAdd is called when a new node is added OnAdd(node *tfv1.GPUNode) // OnUpdate is called when a node is modified diff --git a/internal/webhook/v1/pod_webhook.go b/internal/webhook/v1/pod_webhook.go index 3140b53..8edcb70 100644 --- a/internal/webhook/v1/pod_webhook.go +++ b/internal/webhook/v1/pod_webhook.go @@ -33,6 +33,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" + + "github.com/NexusGPU/tensor-fusion-operator/internal/constants" ) // SetupPodWebhookWithManager registers the webhook for Pod in the manager. @@ -107,7 +109,7 @@ func parseTFReq(pod *corev1.Pod) []TFReq { containerName := container.Name // Check if tensor fusion is enabled for this container - enableKey := fmt.Sprintf("tensor-fusion.ai/enable-%s", containerName) + enableKey := fmt.Sprintf(constants.EnableContainerAnnotationFormat, containerName) if enableStr, ok := pod.Annotations[enableKey]; !ok || enableStr != "true" { continue } @@ -117,7 +119,7 @@ func parseTFReq(pod *corev1.Pod) []TFReq { } // Parse TFLOPS requirement - tflopsKey := fmt.Sprintf("tensor-fusion.ai/tflops-%s", containerName) + tflopsKey := fmt.Sprintf(constants.TFLOPSContainerAnnotationFormat, containerName) if tflopsStr, ok := pod.Annotations[tflopsKey]; ok { tflops, err := resource.ParseQuantity(tflopsStr) if err == nil { @@ -126,7 +128,7 @@ func parseTFReq(pod *corev1.Pod) []TFReq { } // Parse VRAM requirement - vramKey := fmt.Sprintf("tensor-fusion.ai/vram-%s", containerName) + vramKey := fmt.Sprintf(constants.VRAMContainerAnnotationFormat, containerName) if vramStr, ok := pod.Annotations[vramKey]; ok { vram, err := resource.ParseQuantity(vramStr) if err == nil { 
From 7a24ba3112cb01d13a07a919d420b240ed95b92b Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Thu, 5 Dec 2024 07:56:15 +0000 Subject: [PATCH 10/22] fix lint --- internal/scheduler/naive_test.go | 1 + test/e2e/e2e_test.go | 12 ++++++------ test/utils/utils.go | 4 ++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/internal/scheduler/naive_test.go b/internal/scheduler/naive_test.go index f6367a2..823f8c4 100644 --- a/internal/scheduler/naive_test.go +++ b/internal/scheduler/naive_test.go @@ -22,6 +22,7 @@ func createGPUNode(name string, tflops, vram string) *tfv1.GPUNode { } } +//nolint:unparam func createRequest(tflops, vram string) tfv1.Resource { return tfv1.Resource{ Tflops: resource.MustParse(tflops), diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index 67c63dc..f5ff7c5 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -93,27 +93,27 @@ var _ = Describe("Manager", Ordered, func() { cmd := exec.Command("kubectl", "logs", controllerPodName, "-n", namespace) controllerLogs, err := utils.Run(cmd) if err == nil { - _, _ = fmt.Fprintf(GinkgoWriter, fmt.Sprintf("Controller logs:\n %s", controllerLogs)) + _, _ = fmt.Fprintf(GinkgoWriter, "Controller logs:\n %s", controllerLogs) } else { - _, _ = fmt.Fprintf(GinkgoWriter, fmt.Sprintf("Failed to get Controller logs: %s", err)) + _, _ = fmt.Fprintf(GinkgoWriter, "Failed to get Controller logs: %s", err) } By("Fetching Kubernetes events") cmd = exec.Command("kubectl", "get", "events", "-n", namespace, "--sort-by=.lastTimestamp") eventsOutput, err := utils.Run(cmd) if err == nil { - _, _ = fmt.Fprintf(GinkgoWriter, fmt.Sprintf("Kubernetes events:\n%s", eventsOutput)) + _, _ = fmt.Fprintf(GinkgoWriter, "Kubernetes events:\n%s", eventsOutput) } else { - _, _ = fmt.Fprintf(GinkgoWriter, fmt.Sprintf("Failed to get Kubernetes events: %s", err)) + _, _ = fmt.Fprintf(GinkgoWriter, "Failed to get Kubernetes events: %s", err) } By("Fetching curl-metrics logs") cmd = 
exec.Command("kubectl", "logs", "curl-metrics", "-n", namespace) metricsOutput, err := utils.Run(cmd) if err == nil { - _, _ = fmt.Fprintf(GinkgoWriter, fmt.Sprintf("Metrics logs:\n %s", metricsOutput)) + _, _ = fmt.Fprintf(GinkgoWriter, "Metrics logs:\n %s", metricsOutput) } else { - _, _ = fmt.Fprintf(GinkgoWriter, fmt.Sprintf("Failed to get curl-metrics logs: %s", err)) + _, _ = fmt.Fprintf(GinkgoWriter, "Failed to get curl-metrics logs: %s", err) } By("Fetching controller manager pod description") diff --git a/test/utils/utils.go b/test/utils/utils.go index c3d51ce..8319bc4 100644 --- a/test/utils/utils.go +++ b/test/utils/utils.go @@ -92,7 +92,7 @@ func IsPrometheusCRDsInstalled() bool { if err != nil { return false } - crdList := GetNonEmptyLines(string(output)) + crdList := GetNonEmptyLines(output) for _, crd := range prometheusCRDs { for _, line := range crdList { if strings.Contains(line, crd) { @@ -153,7 +153,7 @@ func IsCertManagerCRDsInstalled() bool { } // Check if any of the Cert Manager CRDs are present - crdList := GetNonEmptyLines(string(output)) + crdList := GetNonEmptyLines(output) for _, crd := range certManagerCRDs { for _, line := range crdList { if strings.Contains(line, crd) { From a6f7ca835f4b89ab7b1b4ba57892e7cac4528927 Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Thu, 5 Dec 2024 16:42:13 +0000 Subject: [PATCH 11/22] chore: init naive scheduler --- cmd/main.go | 10 +++++++--- ...n.ai.tensor-fusion.ai_tensorfusionconnections.yaml | 2 ++ internal/controller/gpunode_controller.go | 11 ----------- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index 7f7bf2b..3ed420c 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -40,6 +40,7 @@ import ( tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" "github.com/NexusGPU/tensor-fusion-operator/internal/config" "github.com/NexusGPU/tensor-fusion-operator/internal/controller" + 
"github.com/NexusGPU/tensor-fusion-operator/internal/scheduler" "github.com/NexusGPU/tensor-fusion-operator/internal/server" "github.com/NexusGPU/tensor-fusion-operator/internal/server/router" webhookcorev1 "github.com/NexusGPU/tensor-fusion-operator/internal/webhook/v1" @@ -158,9 +159,12 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "TensorFusionConnection") os.Exit(1) } + + scheduler := scheduler.NewNaiveScheduler() if err = (&controller.GPUNodeReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Scheduler: scheduler, }).SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "GPUNode") os.Exit(1) @@ -173,8 +177,8 @@ func main() { os.Exit(1) } } - // +kubebuilder:scaffold:builder + // +kubebuilder:scaffold:builder if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { setupLog.Error(err, "unable to set up health check") os.Exit(1) diff --git a/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml b/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml index 135776a..c9d1ec9 100644 --- a/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml +++ b/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml @@ -91,6 +91,8 @@ spec: properties: connectionURL: type: string + node: + type: string phase: type: string qosClass: diff --git a/internal/controller/gpunode_controller.go b/internal/controller/gpunode_controller.go index bab8ff5..2c1e2a9 100644 --- a/internal/controller/gpunode_controller.go +++ b/internal/controller/gpunode_controller.go @@ -49,17 +49,6 @@ func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct // SetupWithManager sets up the controller with the Manager. 
func (r *GPUNodeReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error { - // List all existing GPUNodes and add them to scheduler - existingNodes := &tfv1.GPUNodeList{} - if err := r.List(ctx, existingNodes); err != nil { - return err - } - - // Add all existing nodes to scheduler - for i := range existingNodes.Items { - r.Scheduler.OnAdd(&existingNodes.Items[i]) - } - return ctrl.NewControllerManagedBy(mgr). For(&tfv1.GPUNode{}). Named("gpunode"). From 9155aabf755d223fd8fdb0d427d8a46ab5ea72e7 Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Fri, 6 Dec 2024 01:08:39 +0800 Subject: [PATCH 12/22] chmore: rename group --- PROJECT | 2 - README.md | 114 +++++++++++++++++- api/v1/groupversion_info.go | 4 +- ...es.yaml => tensor-fusion.ai_gpunodes.yaml} | 4 +- ...or-fusion.ai_tensorfusionconnections.yaml} | 4 +- config/crd/kustomization.yaml | 4 +- config/rbac/gpunode_editor_role.yaml | 4 +- config/rbac/gpunode_viewer_role.yaml | 4 +- config/rbac/role.yaml | 6 +- .../tensorfusionconnection_editor_role.yaml | 4 +- .../tensorfusionconnection_viewer_role.yaml | 4 +- config/samples/kustomization.yaml | 4 +- ...ion.ai_v1_gpunode.yaml => v1_gpunode.yaml} | 2 +- ...on.yaml => v1_tensorfusionconnection.yaml} | 2 +- internal/controller/gpunode_controller.go | 6 +- .../tensorfusionconnection_controller.go | 6 +- 16 files changed, 142 insertions(+), 32 deletions(-) rename config/crd/bases/{tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml => tensor-fusion.ai_gpunodes.yaml} (97%) rename config/crd/bases/{tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml => tensor-fusion.ai_tensorfusionconnections.yaml} (97%) rename config/samples/{tensor-fusion.ai_v1_gpunode.yaml => v1_gpunode.yaml} (79%) rename config/samples/{tensor-fusion.ai_v1_tensorfusionconnection.yaml => v1_tensorfusionconnection.yaml} (81%) diff --git a/PROJECT b/PROJECT index cde7c4f..6730b7f 100644 --- a/PROJECT +++ b/PROJECT @@ -13,7 +13,6 @@ resources: namespaced: true 
controller: true domain: tensor-fusion.ai - group: tensor-fusion.ai kind: TensorFusionConnection path: github.com/NexusGPU/tensor-fusion-operator/api/v1 version: v1 @@ -22,7 +21,6 @@ resources: namespaced: true controller: true domain: tensor-fusion.ai - group: tensor-fusion.ai kind: GPUNode path: github.com/NexusGPU/tensor-fusion-operator/api/v1 version: v1 diff --git a/README.md b/README.md index f1370f4..f2daa2f 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,114 @@ # tensor-fusion-operator -Tensor Fusion operator including custom resources, admission webhooks, metrics aggregators, cluster management APIs, cloud integration etc. +// TODO(user): Add simple overview of use/purpose + +## Description +// TODO(user): An in-depth paragraph about your project and overview of use + +## Getting Started + +### Prerequisites +- go version v1.22.0+ +- docker version 17.03+. +- kubectl version v1.11.3+. +- Access to a Kubernetes v1.11.3+ cluster. + +### To Deploy on the cluster +**Build and push your image to the location specified by `IMG`:** + +```sh +make docker-build docker-push IMG=<some-registry>/tensor-fusion-operator:tag +``` + +**NOTE:** This image ought to be published in the personal registry you specified. +And it is required to have access to pull the image from the working environment. +Make sure you have the proper permission to the registry if the above commands don’t work. + +**Install the CRDs into the cluster:** + +```sh +make install +``` + +**Deploy the Manager to the cluster with the image specified by `IMG`:** + +```sh +make deploy IMG=<some-registry>/tensor-fusion-operator:tag +``` + +> **NOTE**: If you encounter RBAC errors, you may need to grant yourself cluster-admin +privileges or be logged in as admin. + +**Create instances of your solution** +You can apply the samples (examples) from the config/samples: + +```sh +kubectl apply -k config/samples/ +``` + +>**NOTE**: Ensure that the samples have default values to test it out.
+ +### To Uninstall +**Delete the instances (CRs) from the cluster:** + +```sh +kubectl delete -k config/samples/ +``` + +**Delete the APIs(CRDs) from the cluster:** + +```sh +make uninstall +``` + +**UnDeploy the controller from the cluster:** + +```sh +make undeploy +``` + +## Project Distribution + +Following are the steps to build the installer and distribute this project to users. + +1. Build the installer for the image built and published in the registry: + +```sh +make build-installer IMG=<some-registry>/tensor-fusion-operator:tag +``` + +NOTE: The makefile target mentioned above generates an 'install.yaml' +file in the dist directory. This file contains all the resources built +with Kustomize, which are necessary to install this project without +its dependencies. + +2. Using the installer + +Users can just run kubectl apply -f <URL for YAML BUNDLE> to install the project, i.e.: + +```sh +kubectl apply -f https://raw.githubusercontent.com/<org>/tensor-fusion-operator/<tag or branch>/dist/install.yaml +``` + +## Contributing +// TODO(user): Add detailed information on how you would like others to contribute to this project + +**NOTE:** Run `make help` for more information on all potential `make` targets + +More information can be found via the [Kubebuilder Documentation](https://book.kubebuilder.io/introduction.html) + +## License + +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+ diff --git a/api/v1/groupversion_info.go b/api/v1/groupversion_info.go index 9172ec6..72aaccc 100644 --- a/api/v1/groupversion_info.go +++ b/api/v1/groupversion_info.go @@ -16,7 +16,7 @@ limitations under the License. // Package v1 contains API Schema definitions for the tensor-fusion.ai v1 API group. // +kubebuilder:object:generate=true -// +groupName=tensor-fusion.ai.tensor-fusion.ai +// +groupName=tensor-fusion.ai package v1 import ( @@ -26,7 +26,7 @@ import ( var ( // GroupVersion is group version used to register these objects. - GroupVersion = schema.GroupVersion{Group: "tensor-fusion.ai.tensor-fusion.ai", Version: "v1"} + GroupVersion = schema.GroupVersion{Group: "tensor-fusion.ai", Version: "v1"} // SchemeBuilder is used to add go types to the GroupVersionKind scheme. SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} diff --git a/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml b/config/crd/bases/tensor-fusion.ai_gpunodes.yaml similarity index 97% rename from config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml rename to config/crd/bases/tensor-fusion.ai_gpunodes.yaml index 4829b1f..34442aa 100644 --- a/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml +++ b/config/crd/bases/tensor-fusion.ai_gpunodes.yaml @@ -4,9 +4,9 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.16.4 - name: gpunodes.tensor-fusion.ai.tensor-fusion.ai + name: gpunodes.tensor-fusion.ai spec: - group: tensor-fusion.ai.tensor-fusion.ai + group: tensor-fusion.ai names: kind: GPUNode listKind: GPUNodeList diff --git a/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml b/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml similarity index 97% rename from config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml rename to config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml index c9d1ec9..9fb2714 100644 --- 
a/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml +++ b/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml @@ -4,9 +4,9 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.16.4 - name: tensorfusionconnections.tensor-fusion.ai.tensor-fusion.ai + name: tensorfusionconnections.tensor-fusion.ai spec: - group: tensor-fusion.ai.tensor-fusion.ai + group: tensor-fusion.ai names: kind: TensorFusionConnection listKind: TensorFusionConnectionList diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml index 86141ab..ef965fc 100644 --- a/config/crd/kustomization.yaml +++ b/config/crd/kustomization.yaml @@ -2,8 +2,8 @@ # since it depends on service name and namespace that are out of this kustomize package. # It should be run by config/default resources: -- bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml -- bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml +- bases/tensor-fusion.ai_tensorfusionconnections.yaml +- bases/tensor-fusion.ai_gpunodes.yaml # +kubebuilder:scaffold:crdkustomizeresource patches: diff --git a/config/rbac/gpunode_editor_role.yaml b/config/rbac/gpunode_editor_role.yaml index 11c1526..10e6ec1 100644 --- a/config/rbac/gpunode_editor_role.yaml +++ b/config/rbac/gpunode_editor_role.yaml @@ -8,7 +8,7 @@ metadata: name: gpunode-editor-role rules: - apiGroups: - - tensor-fusion.ai.tensor-fusion.ai + - tensor-fusion.ai resources: - gpunodes verbs: @@ -20,7 +20,7 @@ rules: - update - watch - apiGroups: - - tensor-fusion.ai.tensor-fusion.ai + - tensor-fusion.ai resources: - gpunodes/status verbs: diff --git a/config/rbac/gpunode_viewer_role.yaml b/config/rbac/gpunode_viewer_role.yaml index a4808a0..376b12f 100644 --- a/config/rbac/gpunode_viewer_role.yaml +++ b/config/rbac/gpunode_viewer_role.yaml @@ -8,7 +8,7 @@ metadata: name: gpunode-viewer-role rules: - apiGroups: - - tensor-fusion.ai.tensor-fusion.ai + - tensor-fusion.ai 
resources: - gpunodes verbs: @@ -16,7 +16,7 @@ rules: - list - watch - apiGroups: - - tensor-fusion.ai.tensor-fusion.ai + - tensor-fusion.ai resources: - gpunodes/status verbs: diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index b5d3369..a2a838e 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -5,7 +5,7 @@ metadata: name: manager-role rules: - apiGroups: - - tensor-fusion.ai.tensor-fusion.ai + - tensor-fusion.ai resources: - gpunodes - tensorfusionconnections @@ -18,14 +18,14 @@ rules: - update - watch - apiGroups: - - tensor-fusion.ai.tensor-fusion.ai + - tensor-fusion.ai resources: - gpunodes/finalizers - tensorfusionconnections/finalizers verbs: - update - apiGroups: - - tensor-fusion.ai.tensor-fusion.ai + - tensor-fusion.ai resources: - gpunodes/status - tensorfusionconnections/status diff --git a/config/rbac/tensorfusionconnection_editor_role.yaml b/config/rbac/tensorfusionconnection_editor_role.yaml index d7627ed..dd1c5ff 100644 --- a/config/rbac/tensorfusionconnection_editor_role.yaml +++ b/config/rbac/tensorfusionconnection_editor_role.yaml @@ -8,7 +8,7 @@ metadata: name: tensorfusionconnection-editor-role rules: - apiGroups: - - tensor-fusion.ai.tensor-fusion.ai + - tensor-fusion.ai resources: - tensorfusionconnections verbs: @@ -20,7 +20,7 @@ rules: - update - watch - apiGroups: - - tensor-fusion.ai.tensor-fusion.ai + - tensor-fusion.ai resources: - tensorfusionconnections/status verbs: diff --git a/config/rbac/tensorfusionconnection_viewer_role.yaml b/config/rbac/tensorfusionconnection_viewer_role.yaml index 498b61e..e93e3c0 100644 --- a/config/rbac/tensorfusionconnection_viewer_role.yaml +++ b/config/rbac/tensorfusionconnection_viewer_role.yaml @@ -8,7 +8,7 @@ metadata: name: tensorfusionconnection-viewer-role rules: - apiGroups: - - tensor-fusion.ai.tensor-fusion.ai + - tensor-fusion.ai resources: - tensorfusionconnections verbs: @@ -16,7 +16,7 @@ rules: - list - watch - apiGroups: - - tensor-fusion.ai.tensor-fusion.ai + - 
tensor-fusion.ai resources: - tensorfusionconnections/status verbs: diff --git a/config/samples/kustomization.yaml b/config/samples/kustomization.yaml index 022e343..387e950 100644 --- a/config/samples/kustomization.yaml +++ b/config/samples/kustomization.yaml @@ -1,5 +1,5 @@ ## Append samples of your project ## resources: -- tensor-fusion.ai_v1_tensorfusionconnection.yaml -- tensor-fusion.ai_v1_gpunode.yaml +- v1_tensorfusionconnection.yaml +- v1_gpunode.yaml # +kubebuilder:scaffold:manifestskustomizesamples diff --git a/config/samples/tensor-fusion.ai_v1_gpunode.yaml b/config/samples/v1_gpunode.yaml similarity index 79% rename from config/samples/tensor-fusion.ai_v1_gpunode.yaml rename to config/samples/v1_gpunode.yaml index 0957bdb..0a5d491 100644 --- a/config/samples/tensor-fusion.ai_v1_gpunode.yaml +++ b/config/samples/v1_gpunode.yaml @@ -1,4 +1,4 @@ -apiVersion: tensor-fusion.ai.tensor-fusion.ai/v1 +apiVersion: tensor-fusion.ai/v1 kind: GPUNode metadata: labels: diff --git a/config/samples/tensor-fusion.ai_v1_tensorfusionconnection.yaml b/config/samples/v1_tensorfusionconnection.yaml similarity index 81% rename from config/samples/tensor-fusion.ai_v1_tensorfusionconnection.yaml rename to config/samples/v1_tensorfusionconnection.yaml index 91c2a95..3eb2690 100644 --- a/config/samples/tensor-fusion.ai_v1_tensorfusionconnection.yaml +++ b/config/samples/v1_tensorfusionconnection.yaml @@ -1,4 +1,4 @@ -apiVersion: tensor-fusion.ai.tensor-fusion.ai/v1 +apiVersion: tensor-fusion.ai/v1 kind: TensorFusionConnection metadata: labels: diff --git a/internal/controller/gpunode_controller.go b/internal/controller/gpunode_controller.go index 2c1e2a9..caea975 100644 --- a/internal/controller/gpunode_controller.go +++ b/internal/controller/gpunode_controller.go @@ -36,9 +36,9 @@ type GPUNodeReconciler struct { Scheduler scheduler.Scheduler } -// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=gpunodes,verbs=get;list;watch;create;update;patch;delete -// 
+kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=gpunodes/status,verbs=get;update;patch -// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=gpunodes/finalizers,verbs=update +// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes/finalizers,verbs=update // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. diff --git a/internal/controller/tensorfusionconnection_controller.go b/internal/controller/tensorfusionconnection_controller.go index 8e14687..d546459 100644 --- a/internal/controller/tensorfusionconnection_controller.go +++ b/internal/controller/tensorfusionconnection_controller.go @@ -43,9 +43,9 @@ var ( tensorFusionConnectionFinalizer = constants.TensorFusionFinalizer ) -// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=tensorfusionconnections,verbs=get;list;watch;create;update;patch;delete -// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=tensorfusionconnections/status,verbs=get;update;patch -// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=tensorfusionconnections/finalizers,verbs=update +// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=tensorfusionconnections,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=tensorfusionconnections/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=tensorfusionconnections/finalizers,verbs=update // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. 
From e63deee4d67541f6e8d8218885bc1fde5421ca02 Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Fri, 6 Dec 2024 05:38:11 +0000 Subject: [PATCH 13/22] feat(webhook): implement pod mutating webhook and test infrastructure - Add webhook manifests for pod mutation - Remove conversion webhook config from PROJECT file - Implement webhook test suite with ginkgo framework - Update pod webhook tests to focus on defaulting instead of conversion - Add CA injection verification in e2e tests --- PROJECT | 1 - config/webhook/manifests.yaml | 26 ++++ internal/webhook/v1/pod_webhook_test.go | 22 ++-- internal/webhook/v1/webhook_suite_test.go | 149 ++++++++++++++++++++++ test/e2e/e2e_test.go | 14 ++ 5 files changed, 203 insertions(+), 9 deletions(-) create mode 100644 config/webhook/manifests.yaml create mode 100644 internal/webhook/v1/webhook_suite_test.go diff --git a/PROJECT b/PROJECT index 6730b7f..cc6a157 100644 --- a/PROJECT +++ b/PROJECT @@ -30,6 +30,5 @@ resources: path: k8s.io/api/core/v1 version: v1 webhooks: - conversion: true webhookVersion: v1 version: "3" diff --git a/config/webhook/manifests.yaml b/config/webhook/manifests.yaml new file mode 100644 index 0000000..ec70061 --- /dev/null +++ b/config/webhook/manifests.yaml @@ -0,0 +1,26 @@ +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: mutating-webhook-configuration +webhooks: +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: webhook-service + namespace: system + path: /mutate--v1-pod + failurePolicy: Fail + name: mpod-v1.kb.io + rules: + - apiGroups: + - "" + apiVersions: + - v1 + operations: + - CREATE + - UPDATE + resources: + - pods + sideEffects: None diff --git a/internal/webhook/v1/pod_webhook_test.go b/internal/webhook/v1/pod_webhook_test.go index b83a9f1..1fb7a60 100644 --- a/internal/webhook/v1/pod_webhook_test.go +++ b/internal/webhook/v1/pod_webhook_test.go @@ -26,13 +26,16 @@ import ( var _ = Describe("Pod 
Webhook", func() { var ( - obj *corev1.Pod - oldObj *corev1.Pod + obj *corev1.Pod + oldObj *corev1.Pod + defaulter PodCustomDefaulter ) BeforeEach(func() { obj = &corev1.Pod{} oldObj = &corev1.Pod{} + defaulter = PodCustomDefaulter{} + Expect(defaulter).NotTo(BeNil(), "Expected defaulter to be initialized") Expect(oldObj).NotTo(BeNil(), "Expected oldObj to be initialized") Expect(obj).NotTo(BeNil(), "Expected obj to be initialized") // TODO (user): Add any setup logic common to all tests @@ -42,13 +45,16 @@ var _ = Describe("Pod Webhook", func() { // TODO (user): Add any teardown logic common to all tests }) - Context("When creating Pod under Conversion Webhook", func() { - // TODO (user): Add logic to convert the object to the desired version and verify the conversion + Context("When creating Pod under Defaulting Webhook", func() { + // TODO (user): Add logic for defaulting webhooks // Example: - // It("Should convert the object correctly", func() { - // convertedObj := &corev1.Pod{} - // Expect(obj.ConvertTo(convertedObj)).To(Succeed()) - // Expect(convertedObj).ToNot(BeNil()) + // It("Should apply defaults when a required field is empty", func() { + // By("simulating a scenario where defaults should be applied") + // obj.SomeFieldWithDefault = "" + // By("calling the Default method to apply defaults") + // defaulter.Default(ctx, obj) + // By("checking that the default values are set") + // Expect(obj.SomeFieldWithDefault).To(Equal("default_value")) // }) }) diff --git a/internal/webhook/v1/webhook_suite_test.go b/internal/webhook/v1/webhook_suite_test.go new file mode 100644 index 0000000..2ab08ea --- /dev/null +++ b/internal/webhook/v1/webhook_suite_test.go @@ -0,0 +1,149 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1 + +import ( + "context" + "crypto/tls" + "fmt" + "net" + "path/filepath" + "runtime" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + admissionv1 "k8s.io/api/admission/v1" + corev1 "k8s.io/api/core/v1" + + // +kubebuilder:scaffold:imports + apimachineryruntime "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/rest" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/envtest" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + "sigs.k8s.io/controller-runtime/pkg/webhook" +) + +// These tests use Ginkgo (BDD-style Go testing framework). Refer to +// http://onsi.github.io/ginkgo/ to learn more about Ginkgo. + +var ( + cancel context.CancelFunc + cfg *rest.Config + ctx context.Context + k8sClient client.Client + testEnv *envtest.Environment +) + +func TestAPIs(t *testing.T) { + RegisterFailHandler(Fail) + + RunSpecs(t, "Webhook Suite") +} + +var _ = BeforeSuite(func() { + logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) + + ctx, cancel = context.WithCancel(context.TODO()) + + By("bootstrapping test environment") + testEnv = &envtest.Environment{ + CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "config", "crd", "bases")}, + ErrorIfCRDPathMissing: false, + + // The BinaryAssetsDirectory is only required if you want to run the tests directly + // without call the makefile target test. 
If not informed it will look for the + // default path defined in controller-runtime which is /usr/local/kubebuilder/. + // Note that you must have the required binaries setup under the bin directory to perform + // the tests directly. When we run make test it will be setup and used automatically. + BinaryAssetsDirectory: filepath.Join("..", "..", "..", "bin", "k8s", + fmt.Sprintf("1.31.0-%s-%s", runtime.GOOS, runtime.GOARCH)), + + WebhookInstallOptions: envtest.WebhookInstallOptions{ + Paths: []string{filepath.Join("..", "..", "..", "config", "webhook")}, + }, + } + + var err error + // cfg is defined in this file globally. + cfg, err = testEnv.Start() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg).NotTo(BeNil()) + + scheme := apimachineryruntime.NewScheme() + err = corev1.AddToScheme(scheme) + Expect(err).NotTo(HaveOccurred()) + + err = admissionv1.AddToScheme(scheme) + Expect(err).NotTo(HaveOccurred()) + + // +kubebuilder:scaffold:scheme + + k8sClient, err = client.New(cfg, client.Options{Scheme: scheme}) + Expect(err).NotTo(HaveOccurred()) + Expect(k8sClient).NotTo(BeNil()) + + // start webhook server using Manager. + webhookInstallOptions := &testEnv.WebhookInstallOptions + mgr, err := ctrl.NewManager(cfg, ctrl.Options{ + Scheme: scheme, + WebhookServer: webhook.NewServer(webhook.Options{ + Host: webhookInstallOptions.LocalServingHost, + Port: webhookInstallOptions.LocalServingPort, + CertDir: webhookInstallOptions.LocalServingCertDir, + }), + LeaderElection: false, + Metrics: metricsserver.Options{BindAddress: "0"}, + }) + Expect(err).NotTo(HaveOccurred()) + + err = SetupPodWebhookWithManager(mgr) + Expect(err).NotTo(HaveOccurred()) + + // +kubebuilder:scaffold:webhook + + go func() { + defer GinkgoRecover() + err = mgr.Start(ctx) + Expect(err).NotTo(HaveOccurred()) + }() + + // wait for the webhook server to get ready. 
+ dialer := &net.Dialer{Timeout: time.Second} + addrPort := fmt.Sprintf("%s:%d", webhookInstallOptions.LocalServingHost, webhookInstallOptions.LocalServingPort) + Eventually(func() error { + conn, err := tls.DialWithDialer(dialer, "tcp", addrPort, &tls.Config{InsecureSkipVerify: true}) + if err != nil { + return err + } + + return conn.Close() + }).Should(Succeed()) +}) + +var _ = AfterSuite(func() { + By("tearing down the test environment") + cancel() + err := testEnv.Stop() + Expect(err).NotTo(HaveOccurred()) +}) diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index f5ff7c5..b886624 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -244,6 +244,20 @@ var _ = Describe("Manager", Ordered, func() { Eventually(verifyCertManager).Should(Succeed()) }) + It("should have CA injection for mutating webhooks", func() { + By("checking CA injection for mutating webhooks") + verifyCAInjection := func(g Gomega) { + cmd := exec.Command("kubectl", "get", + "mutatingwebhookconfigurations.admissionregistration.k8s.io", + "tensor-fusion-operator-mutating-webhook-configuration", + "-o", "go-template={{ range .webhooks }}{{ .clientConfig.caBundle }}{{ end }}") + mwhOutput, err := utils.Run(cmd) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(len(mwhOutput)).To(BeNumerically(">", 10)) + } + Eventually(verifyCAInjection).Should(Succeed()) + }) + // +kubebuilder:scaffold:e2e-webhooks-checks // TODO: Customize the e2e test suite with scenarios specific to your project. 
From 4bc43c45eeee51fc0a5899798b9dcfc4cda84c4a Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Fri, 6 Dec 2024 05:40:52 +0000 Subject: [PATCH 14/22] fix tests --- internal/webhook/v1/pod_webhook_test.go | 7 ++----- internal/webhook/v1/webhook_suite_test.go | 4 +++- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/internal/webhook/v1/pod_webhook_test.go b/internal/webhook/v1/pod_webhook_test.go index 1fb7a60..44e1981 100644 --- a/internal/webhook/v1/pod_webhook_test.go +++ b/internal/webhook/v1/pod_webhook_test.go @@ -26,16 +26,13 @@ import ( var _ = Describe("Pod Webhook", func() { var ( - obj *corev1.Pod - oldObj *corev1.Pod - defaulter PodCustomDefaulter + obj *corev1.Pod + oldObj *corev1.Pod ) BeforeEach(func() { obj = &corev1.Pod{} oldObj = &corev1.Pod{} - defaulter = PodCustomDefaulter{} - Expect(defaulter).NotTo(BeNil(), "Expected defaulter to be initialized") Expect(oldObj).NotTo(BeNil(), "Expected oldObj to be initialized") Expect(obj).NotTo(BeNil(), "Expected obj to be initialized") // TODO (user): Add any setup logic common to all tests diff --git a/internal/webhook/v1/webhook_suite_test.go b/internal/webhook/v1/webhook_suite_test.go index 2ab08ea..119c947 100644 --- a/internal/webhook/v1/webhook_suite_test.go +++ b/internal/webhook/v1/webhook_suite_test.go @@ -26,6 +26,7 @@ import ( "testing" "time" + "github.com/NexusGPU/tensor-fusion-operator/internal/config" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" @@ -117,7 +118,8 @@ var _ = BeforeSuite(func() { }) Expect(err).NotTo(HaveOccurred()) - err = SetupPodWebhookWithManager(mgr) + conf := config.NewDefaultConfig() + err = SetupPodWebhookWithManager(mgr, &conf.PodMutator) Expect(err).NotTo(HaveOccurred()) // +kubebuilder:scaffold:webhook From 4d3bc2df69208bb2b756af5e99739eb09c64d74b Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Fri, 6 Dec 2024 08:15:51 +0000 Subject: [PATCH 15/22] feat(scheduler): implement naive scheduler release with resource tracking - Update Release method in naive scheduler to properly track and restore resources - Add requeue duration for pending connections - Fix status updates in TensorFusionConnection controller - Update tests to verify partial and full resource releases - Make QosClass optional in TensorFusionConnection status --- api/v1/tensorfusionconnection_types.go | 2 +- cmd/main.go | 7 +- ...sor-fusion.ai_tensorfusionconnections.yaml | 1 - config/manager/kustomization.yaml | 2 + config/samples/v1_gpunode.yaml | 12 +- config/samples/v1_tensorfusionconnection.yaml | 9 +- config/webhook/kustomization.yaml | 3 + go.mod | 3 +- internal/constants/constants.go | 4 + .../tensorfusionconnection_controller.go | 20 +- internal/scheduler/naive.go | 9 +- internal/scheduler/naive_test.go | 71 ++++++-- internal/scheduler/scheduler.go | 2 +- internal/webhook/v1/pod_webhook_test.go | 171 +++++++++++++++--- 14 files changed, 259 insertions(+), 57 deletions(-) diff --git a/api/v1/tensorfusionconnection_types.go b/api/v1/tensorfusionconnection_types.go index 00af5f2..5594e92 100644 --- a/api/v1/tensorfusionconnection_types.go +++ b/api/v1/tensorfusionconnection_types.go @@ -48,7 +48,7 @@ const ( type TensorFusionConnectionStatus struct { Phase TensorFusionConnectionPhase `json:"phase"` ConnectionURL string `json:"connectionURL"` - QosClass string `json:"qosClass"` + QosClass string `json:"qosClass,omitempty"` Node string `json:"node,omitempty"` } diff --git 
a/cmd/main.go b/cmd/main.go index 3ed420c..eb3e6a7 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -152,15 +152,16 @@ func main() { ctx := context.Background() config := config.NewDefaultConfig() + scheduler := scheduler.NewNaiveScheduler() if err = (&controller.TensorFusionConnectionReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Scheduler: scheduler, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "TensorFusionConnection") os.Exit(1) } - scheduler := scheduler.NewNaiveScheduler() if err = (&controller.GPUNodeReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), diff --git a/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml b/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml index 9fb2714..43407d0 100644 --- a/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml +++ b/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml @@ -100,7 +100,6 @@ spec: required: - connectionURL - phase - - qosClass type: object type: object served: true diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index 5c5f0b8..be49923 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -1,2 +1,4 @@ +namespace: tensor-fusion + resources: - manager.yaml diff --git a/config/samples/v1_gpunode.yaml b/config/samples/v1_gpunode.yaml index 0a5d491..484525d 100644 --- a/config/samples/v1_gpunode.yaml +++ b/config/samples/v1_gpunode.yaml @@ -5,5 +5,13 @@ metadata: app.kubernetes.io/name: tensor-fusion-operator app.kubernetes.io/managed-by: kustomize name: gpunode-sample -spec: - # TODO(user): Add fields here + namespace: tensor-fusion +status: + capacity: + tflops: '200' + vram: 100Gi + available: + tflops: '200' + vram: 100Gi + devices: [] + \ No newline at end of file diff --git a/config/samples/v1_tensorfusionconnection.yaml 
b/config/samples/v1_tensorfusionconnection.yaml index 3eb2690..cc634ff 100644 --- a/config/samples/v1_tensorfusionconnection.yaml +++ b/config/samples/v1_tensorfusionconnection.yaml @@ -5,5 +5,12 @@ metadata: app.kubernetes.io/name: tensor-fusion-operator app.kubernetes.io/managed-by: kustomize name: tensorfusionconnection-sample + namespace: tensor-fusion spec: - # TODO(user): Add fields here + resources: + limit: + tflops: '100' + vram: 8Gi + request: + tflops: '20' + vram: 9Gi diff --git a/config/webhook/kustomization.yaml b/config/webhook/kustomization.yaml index 9cf2613..3535f61 100644 --- a/config/webhook/kustomization.yaml +++ b/config/webhook/kustomization.yaml @@ -1,6 +1,9 @@ +namespace: tensor-fusion + resources: - manifests.yaml - service.yaml configurations: - kustomizeconfig.yaml + diff --git a/go.mod b/go.mod index deab21d..bbffb49 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/gin-gonic/gin v1.10.0 github.com/onsi/ginkgo/v2 v2.19.0 github.com/onsi/gomega v1.33.1 + gomodules.xyz/jsonpatch/v2 v2.4.0 k8s.io/api v0.31.0 k8s.io/apimachinery v0.31.0 k8s.io/client-go v0.31.0 @@ -97,11 +98,11 @@ require ( golang.org/x/text v0.16.0 // indirect golang.org/x/time v0.3.0 // indirect golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect - gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094 // indirect google.golang.org/grpc v1.65.0 // indirect google.golang.org/protobuf v1.34.2 // indirect + gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/internal/constants/constants.go b/internal/constants/constants.go index acf7eee..130a29d 100644 --- a/internal/constants/constants.go +++ b/internal/constants/constants.go @@ -1,5 +1,7 @@ package constants +import 
"time" + const ( // TensorFusionDomain is the domain prefix used for all tensor-fusion.ai related annotations and finalizers TensorFusionDomain = "tensor-fusion.ai" @@ -12,4 +14,6 @@ const ( EnableContainerAnnotationFormat = TensorFusionDomain + "/enable-%s" TFLOPSContainerAnnotationFormat = TensorFusionDomain + "/tflops-%s" VRAMContainerAnnotationFormat = TensorFusionDomain + "/vram-%s" + + PendingRequeueDuration = time.Second * 3 ) diff --git a/internal/controller/tensorfusionconnection_controller.go b/internal/controller/tensorfusionconnection_controller.go index d546459..47c6397 100644 --- a/internal/controller/tensorfusionconnection_controller.go +++ b/internal/controller/tensorfusionconnection_controller.go @@ -96,14 +96,16 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct // If status is not set or pending, try to schedule if connection.Status.Phase == "" || connection.Status.Phase == tfv1.TensorFusionConnectionPending { // Try to get an available node from scheduler - node, err := r.Scheduler.Schedule(connection.Spec.Resources.Request) + var err error + node, err = r.Scheduler.Schedule(connection.Spec.Resources.Request) if err != nil { - log.Error(err, "Failed to schedule connection") + log.Info(err.Error()) connection.Status.Phase = tfv1.TensorFusionConnectionPending } else if node != nil { connection.Status.Phase = tfv1.TensorFusionConnectionRunning connection.Status.ConnectionURL = worker.GenerateConnectionURL(node, connection) - connection.Status.Node = node.Name // Store the node name for cleanup + // Store the node name for cleanup + connection.Status.Node = node.Name } else { connection.Status.Phase = tfv1.TensorFusionConnectionPending } @@ -113,6 +115,9 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct return ctrl.Result{}, err } + if connection.Status.Phase == tfv1.TensorFusionConnectionPending { + return ctrl.Result{RequeueAfter: constants.PendingRequeueDuration}, nil + } return 
ctrl.Result{}, nil } @@ -124,7 +129,7 @@ func (r *TensorFusionConnectionReconciler) handleDeletion(ctx context.Context, c // Get the node node := &tfv1.GPUNode{} - if err := r.Get(ctx, client.ObjectKey{Name: connection.Status.Node}, node); err != nil { + if err := r.Get(ctx, client.ObjectKey{Name: connection.Status.Node, Namespace: connection.Namespace}, node); err != nil { if errors.IsNotFound(err) { // Node is already gone, nothing to do return nil @@ -133,11 +138,11 @@ func (r *TensorFusionConnectionReconciler) handleDeletion(ctx context.Context, c } // Release the resources - if err := r.Scheduler.Release(node); err != nil { + if err := r.Scheduler.Release(connection.Spec.Resources.Request, node); err != nil { return err } - return nil + return r.MustUpdateStatus(ctx, connection, node) } // Helper functions to handle finalizers @@ -172,8 +177,7 @@ func (r *TensorFusionConnectionReconciler) MustUpdateStatus(ctx context.Context, } // Update the status fields we care about - latestConnection.Status.Phase = connection.Status.Phase - latestConnection.Status.ConnectionURL = connection.Status.ConnectionURL + latestConnection.Status = connection.Status // Update the connection status if err := r.Status().Update(ctx, latestConnection); err != nil { diff --git a/internal/scheduler/naive.go b/internal/scheduler/naive.go index 0192173..2af423e 100644 --- a/internal/scheduler/naive.go +++ b/internal/scheduler/naive.go @@ -60,7 +60,7 @@ func (s *NaiveScheduler) OnDelete(node *tfv1.GPUNode) { } // Release implements Scheduler interface -func (s *NaiveScheduler) Release(node *tfv1.GPUNode) error { +func (s *NaiveScheduler) Release(request tfv1.Resource, node *tfv1.GPUNode) error { s.Lock() defer s.Unlock() @@ -69,7 +69,10 @@ func (s *NaiveScheduler) Release(node *tfv1.GPUNode) error { return fmt.Errorf("node %s not found", node.Name) } - // Reset the node's available resources to its capacity - existingNode.Status.Available = existingNode.Status.Capacity + // Add back the 
released resources + existingNode.Status.Available.Tflops.Add(request.Tflops) + existingNode.Status.Available.Vram.Add(request.Vram) + // output the updated node + node.Status.Available = existingNode.Status.Available return nil } diff --git a/internal/scheduler/naive_test.go b/internal/scheduler/naive_test.go index 823f8c4..3cdfea4 100644 --- a/internal/scheduler/naive_test.go +++ b/internal/scheduler/naive_test.go @@ -168,14 +168,18 @@ func TestNaiveScheduler_NodeOperations(t *testing.T) { func TestNaiveScheduler_Release(t *testing.T) { tests := []struct { - name string - node *tfv1.GPUNode - schedule *tfv1.Resource - wantError bool + name string + node *tfv1.GPUNode + schedule *tfv1.Resource + release *tfv1.Resource + wantError bool + wantRemainingTflops string + wantRemainingVram string }{ { name: "release non-existent node", node: createGPUNode("node1", "100", "16Gi"), + release: &tfv1.Resource{}, wantError: true, }, { @@ -199,7 +203,42 @@ func TestNaiveScheduler_Release(t *testing.T) { Tflops: resource.MustParse("50"), Vram: resource.MustParse("8Gi"), }, - wantError: false, + release: &tfv1.Resource{ + Tflops: resource.MustParse("50"), + Vram: resource.MustParse("8Gi"), + }, + wantError: false, + wantRemainingTflops: "100", + wantRemainingVram: "16Gi", + }, + { + name: "partial release", + node: &tfv1.GPUNode{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + }, + Status: tfv1.GPUNodeStatus{ + Capacity: tfv1.Resource{ + Tflops: resource.MustParse("100"), + Vram: resource.MustParse("16Gi"), + }, + Available: tfv1.Resource{ + Tflops: resource.MustParse("100"), + Vram: resource.MustParse("16Gi"), + }, + }, + }, + schedule: &tfv1.Resource{ + Tflops: resource.MustParse("60"), + Vram: resource.MustParse("10Gi"), + }, + release: &tfv1.Resource{ + Tflops: resource.MustParse("30"), + Vram: resource.MustParse("5Gi"), + }, + wantError: false, + wantRemainingTflops: "70", + wantRemainingVram: "11Gi", }, } @@ -220,26 +259,34 @@ func TestNaiveScheduler_Release(t 
*testing.T) { } // Verify resources were allocated - if node.Status.Available.Tflops.Cmp(resource.MustParse("50")) != 0 || - node.Status.Available.Vram.Cmp(resource.MustParse("8Gi")) != 0 { + expectedTflops := tt.node.Status.Capacity.Tflops.DeepCopy() + expectedVram := tt.node.Status.Capacity.Vram.DeepCopy() + expectedTflops.Sub(tt.schedule.Tflops) + expectedVram.Sub(tt.schedule.Vram) + if node.Status.Available.Tflops.Cmp(expectedTflops) != 0 || + node.Status.Available.Vram.Cmp(expectedVram) != 0 { t.Errorf("Schedule() did not allocate resources correctly") return } } } - err := s.Release(tt.node) + err := s.Release(*tt.release, tt.node) if (err != nil) != tt.wantError { t.Errorf("Release() error = %v, wantError %v", err, tt.wantError) return } if !tt.wantError { - // Verify resources were restored + // Verify resources were restored correctly node := s.nodes[tt.node.Name] - if node.Status.Available.Tflops.Cmp(node.Status.Capacity.Tflops) != 0 || - node.Status.Available.Vram.Cmp(node.Status.Capacity.Vram) != 0 { - t.Errorf("Release() did not restore resources correctly") + if node.Status.Available.Tflops.String() != tt.wantRemainingTflops || + node.Status.Available.Vram.String() != tt.wantRemainingVram { + t.Errorf("Release() resources incorrect, got tflops=%v vram=%v, want tflops=%v vram=%v", + node.Status.Available.Tflops.String(), + node.Status.Available.Vram.String(), + tt.wantRemainingTflops, + tt.wantRemainingVram) } } }) diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go index 163124c..bd18893 100644 --- a/internal/scheduler/scheduler.go +++ b/internal/scheduler/scheduler.go @@ -12,7 +12,7 @@ type Scheduler interface { Schedule(request tfv1.Resource) (*tfv1.GPUNode, error) // Release frees the allocated resources of a node - Release(node *tfv1.GPUNode) error + Release(request tfv1.Resource, node *tfv1.GPUNode) error // OnAdd is called when a new node is added OnAdd(node *tfv1.GPUNode) diff --git 
a/internal/webhook/v1/pod_webhook_test.go b/internal/webhook/v1/pod_webhook_test.go index 44e1981..87eaa79 100644 --- a/internal/webhook/v1/pod_webhook_test.go +++ b/internal/webhook/v1/pod_webhook_test.go @@ -17,42 +17,165 @@ limitations under the License. package v1 import ( + "context" + "encoding/json" + "net/http" + + tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" + "github.com/NexusGPU/tensor-fusion-operator/internal/config" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - + admissionv1 "k8s.io/api/admission/v1" corev1 "k8s.io/api/core/v1" - // TODO (user): Add any additional imports if needed + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/webhook/admission" ) -var _ = Describe("Pod Webhook", func() { +var _ = Describe("TensorFusionPodMutator", func() { var ( - obj *corev1.Pod - oldObj *corev1.Pod + mutator *TensorFusionPodMutator + ctx context.Context + scheme *runtime.Scheme + decoder admission.Decoder + client client.Client ) BeforeEach(func() { - obj = &corev1.Pod{} - oldObj = &corev1.Pod{} - Expect(oldObj).NotTo(BeNil(), "Expected oldObj to be initialized") - Expect(obj).NotTo(BeNil(), "Expected obj to be initialized") - // TODO (user): Add any setup logic common to all tests - }) + ctx = context.Background() + scheme = runtime.NewScheme() + Expect(corev1.AddToScheme(scheme)).To(Succeed()) + Expect(tfv1.AddToScheme(scheme)).To(Succeed()) + + decoder = admission.NewDecoder(scheme) + client = fake.NewClientBuilder().WithScheme(scheme).Build() - AfterEach(func() { - // TODO (user): Add any teardown logic common to all tests + config := config.NewDefaultConfig() + mutator = &TensorFusionPodMutator{ + Client: client, + Config: &config.PodMutator, + } + Expect(mutator.InjectDecoder(decoder)).To(Succeed()) }) - Context("When creating Pod under Defaulting Webhook", func() { 
- // TODO (user): Add logic for defaulting webhooks - // Example: - // It("Should apply defaults when a required field is empty", func() { - // By("simulating a scenario where defaults should be applied") - // obj.SomeFieldWithDefault = "" - // By("calling the Default method to apply defaults") - // defaulter.Default(ctx, obj) - // By("checking that the default values are set") - // Expect(obj.SomeFieldWithDefault).To(Equal("default_value")) - // }) + Context("Handle", func() { + It("should successfully mutate a pod with TF requirements", func() { + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "default", + Annotations: map[string]string{ + "tf.nexusgpu.com/tflops": "100", + "tf.nexusgpu.com/vram": "16Gi", + }, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "main", + Image: "test-image", + }, + }, + }, + } + + podBytes, err := json.Marshal(pod) + Expect(err).NotTo(HaveOccurred()) + + req := admission.Request{ + AdmissionRequest: admissionv1.AdmissionRequest{ + Object: runtime.RawExtension{ + Raw: podBytes, + }, + Operation: admissionv1.Create, + }, + } + + resp := mutator.Handle(ctx, req) + Expect(resp.Allowed).To(BeTrue()) + Expect(resp.Patches).NotTo(BeEmpty()) + + // Verify TensorFusionConnection was created + tfConnList := &tfv1.TensorFusionConnectionList{} + err = client.List(ctx, tfConnList) + Expect(err).NotTo(HaveOccurred()) + Expect(tfConnList.Items).To(HaveLen(1)) + }) + + It("should handle pods without TF requirements", func() { + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod-no-tf", + Namespace: "default", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "main", + Image: "test-image", + }, + }, + }, + } + + podBytes, err := json.Marshal(pod) + Expect(err).NotTo(HaveOccurred()) + + req := admission.Request{ + AdmissionRequest: admissionv1.AdmissionRequest{ + Object: runtime.RawExtension{ + Raw: podBytes, + }, + Operation: 
admissionv1.Create, + }, + } + + resp := mutator.Handle(ctx, req) + Expect(resp.Allowed).To(BeTrue()) + Expect(resp.Patches).To(BeEmpty()) + }) + + It("should handle invalid pod specification", func() { + req := admission.Request{ + AdmissionRequest: admissionv1.AdmissionRequest{ + Object: runtime.RawExtension{ + Raw: []byte("invalid json"), + }, + Operation: admissionv1.Create, + }, + } + + resp := mutator.Handle(ctx, req) + Expect(resp.Allowed).To(BeFalse()) + Expect(resp.Result.Code).To(Equal(int32(http.StatusBadRequest))) + }) }) + Context("parseTFReq", func() { + It("should correctly parse TF requirements from pod annotations", func() { + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + "tf.nexusgpu.com/tflops": "100", + "tf.nexusgpu.com/vram": "16Gi", + }, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "test-container", + }, + }, + }, + } + + reqs := parseTFReq(pod) + Expect(reqs).To(HaveLen(1)) + Expect(reqs[0].ContainerName).To(Equal("test-container")) + Expect(reqs[0].Tflops.String()).To(Equal("100")) + Expect(reqs[0].Vram.String()).To(Equal("16Gi")) + }) + }) }) From 107f4f987b8c1773da1db326e653af81b9d4b789 Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Fri, 6 Dec 2024 08:31:23 +0000 Subject: [PATCH 16/22] feat(server): add connection URL response and logging middleware - Add direct URL response for running connections - Enable gin logging middleware for better request tracking - Improve connection status handling --- internal/server/router/connection.go | 5 +++++ internal/server/server.go | 2 ++ 2 files changed, 7 insertions(+) diff --git a/internal/server/router/connection.go b/internal/server/router/connection.go index 1abad0e..678c685 100644 --- a/internal/server/router/connection.go +++ b/internal/server/router/connection.go @@ -35,6 +35,11 @@ func (cr *ConnectionRouter) Get(ctx *gin.Context) { return } + if conn.Status.Phase == tfv1.TensorFusionConnectionRunning 
{ + ctx.JSON(200, conn.Status.ConnectionURL) + return + } + // Subscribe to connection updates ch, cancelFunc := cr.watcher.subscribe(req) defer cancelFunc() diff --git a/internal/server/server.go b/internal/server/server.go index fa2995c..3131b47 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -9,9 +9,11 @@ import ( func NewHTTPServer( cr *router.ConnectionRouter, ) *gin.Engine { + r := gin.New() r.Use(gzip.Gzip(gzip.DefaultCompression)) r.Use(gin.Recovery()) + r.Use(gin.Logger()) apiGroup := r.Group("/api") apiGroup.GET("/connection", cr.Get) From 345580f700a41b35ce90f63981a84b2969097e07 Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Fri, 6 Dec 2024 08:49:58 +0000 Subject: [PATCH 17/22] feat: change Gpunode to clusterlevel resource --- PROJECT | 1 - api/v1/gpunode_types.go | 2 +- config/crd/bases/tensor-fusion.ai_gpunodes.yaml | 2 +- config/samples/v1_gpunode.yaml | 1 - internal/controller/tensorfusionconnection_controller.go | 2 +- 5 files changed, 3 insertions(+), 5 deletions(-) diff --git a/PROJECT b/PROJECT index cc6a157..9104511 100644 --- a/PROJECT +++ b/PROJECT @@ -18,7 +18,6 @@ resources: version: v1 - api: crdVersion: v1 - namespaced: true controller: true domain: tensor-fusion.ai kind: GPUNode diff --git a/api/v1/gpunode_types.go b/api/v1/gpunode_types.go index 6a752ba..18ab7f1 100644 --- a/api/v1/gpunode_types.go +++ b/api/v1/gpunode_types.go @@ -29,7 +29,7 @@ type GPUNodeStatus struct { // +kubebuilder:object:root=true // +kubebuilder:subresource:status - +// +kubebuilder:resource:scope=Cluster // GPUNode is the Schema for the gpunodes API. 
type GPUNode struct { metav1.TypeMeta `json:",inline"` diff --git a/config/crd/bases/tensor-fusion.ai_gpunodes.yaml b/config/crd/bases/tensor-fusion.ai_gpunodes.yaml index 34442aa..bb62054 100644 --- a/config/crd/bases/tensor-fusion.ai_gpunodes.yaml +++ b/config/crd/bases/tensor-fusion.ai_gpunodes.yaml @@ -12,7 +12,7 @@ spec: listKind: GPUNodeList plural: gpunodes singular: gpunode - scope: Namespaced + scope: Cluster versions: - name: v1 schema: diff --git a/config/samples/v1_gpunode.yaml b/config/samples/v1_gpunode.yaml index 484525d..84a50cb 100644 --- a/config/samples/v1_gpunode.yaml +++ b/config/samples/v1_gpunode.yaml @@ -5,7 +5,6 @@ metadata: app.kubernetes.io/name: tensor-fusion-operator app.kubernetes.io/managed-by: kustomize name: gpunode-sample - namespace: tensor-fusion status: capacity: tflops: '200' diff --git a/internal/controller/tensorfusionconnection_controller.go b/internal/controller/tensorfusionconnection_controller.go index 47c6397..e631a55 100644 --- a/internal/controller/tensorfusionconnection_controller.go +++ b/internal/controller/tensorfusionconnection_controller.go @@ -129,7 +129,7 @@ func (r *TensorFusionConnectionReconciler) handleDeletion(ctx context.Context, c // Get the node node := &tfv1.GPUNode{} - if err := r.Get(ctx, client.ObjectKey{Name: connection.Status.Node, Namespace: connection.Namespace}, node); err != nil { + if err := r.Get(ctx, client.ObjectKey{Name: connection.Status.Node}, node); err != nil { if errors.IsNotFound(err) { // Node is already gone, nothing to do return nil From 2935d3dded975825895fba062f0c2521bb094ae6 Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Mon, 9 Dec 2024 07:35:52 +0000 Subject: [PATCH 18/22] chore: change Resource struct field names from Request/Limit to Requests/Limits to align with Kubernetes resource naming conventions. 
--- api/v1/tensorfusionconnection_types.go | 4 ++-- api/v1/zz_generated.deepcopy.go | 4 ++-- internal/controller/tensorfusionconnection_controller.go | 4 ++-- internal/webhook/v1/pod_webhook.go | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/api/v1/tensorfusionconnection_types.go b/api/v1/tensorfusionconnection_types.go index 5594e92..c79bfa4 100644 --- a/api/v1/tensorfusionconnection_types.go +++ b/api/v1/tensorfusionconnection_types.go @@ -27,8 +27,8 @@ type Resource struct { } type Resources struct { - Request Resource `json:"request"` - Limit Resource `json:"limit"` + Requests Resource `json:"requests"` + Limits Resource `json:"limits"` } // TensorFusionConnectionSpec defines the desired state of TensorFusionConnection. diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index d899ad7..8ff9bbe 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -124,8 +124,8 @@ func (in *Resource) DeepCopy() *Resource { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Resources) DeepCopyInto(out *Resources) { *out = *in - in.Request.DeepCopyInto(&out.Request) - in.Limit.DeepCopyInto(&out.Limit) + in.Requests.DeepCopyInto(&out.Requests) + in.Limits.DeepCopyInto(&out.Limits) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Resources. 
diff --git a/internal/controller/tensorfusionconnection_controller.go b/internal/controller/tensorfusionconnection_controller.go index e631a55..65937b1 100644 --- a/internal/controller/tensorfusionconnection_controller.go +++ b/internal/controller/tensorfusionconnection_controller.go @@ -97,7 +97,7 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct if connection.Status.Phase == "" || connection.Status.Phase == tfv1.TensorFusionConnectionPending { // Try to get an available node from scheduler var err error - node, err = r.Scheduler.Schedule(connection.Spec.Resources.Request) + node, err = r.Scheduler.Schedule(connection.Spec.Resources.Requests) if err != nil { log.Info(err.Error()) connection.Status.Phase = tfv1.TensorFusionConnectionPending @@ -138,7 +138,7 @@ func (r *TensorFusionConnectionReconciler) handleDeletion(ctx context.Context, c } // Release the resources - if err := r.Scheduler.Release(connection.Spec.Resources.Request, node); err != nil { + if err := r.Scheduler.Release(connection.Spec.Resources.Requests, node); err != nil { return err } diff --git a/internal/webhook/v1/pod_webhook.go b/internal/webhook/v1/pod_webhook.go index 8edcb70..2b9bfcf 100644 --- a/internal/webhook/v1/pod_webhook.go +++ b/internal/webhook/v1/pod_webhook.go @@ -207,11 +207,11 @@ func generateTensorFusionConnection(pod *corev1.Pod, tfReq []TFReq) []*tfv1.Tens }, Spec: tfv1.TensorFusionConnectionSpec{ Resources: tfv1.Resources{ - Request: tfv1.Resource{ + Requests: tfv1.Resource{ Tflops: req.Tflops, Vram: req.Vram, }, - Limit: tfv1.Resource{ + Limits: tfv1.Resource{ Tflops: req.Tflops, Vram: req.Vram, }, From ade38f8781cbb88282f528c72d15b63b974172f0 Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Mon, 9 Dec 2024 07:50:13 +0000 Subject: [PATCH 19/22] feat: rename from 'GPUNode' to 'GPU' --- PROJECT | 2 +- api/v1/{gpunode_types.go => gpu_types.go} | 19 ++- api/v1/tensorfusionconnection_types.go | 2 +- 
api/v1/zz_generated.deepcopy.go | 35 ++--- cmd/main.go | 4 +- ...unodes.yaml => tensor-fusion.ai_gpus.yaml} | 19 +-- ...sor-fusion.ai_tensorfusionconnections.yaml | 8 +- config/crd/kustomization.yaml | 2 +- ..._editor_role.yaml => gpu_editor_role.yaml} | 8 +- ..._viewer_role.yaml => gpu_viewer_role.yaml} | 8 +- config/rbac/kustomization.yaml | 4 +- config/rbac/role.yaml | 6 +- config/samples/kustomization.yaml | 2 +- .../samples/{v1_gpunode.yaml => v1_gpu.yaml} | 4 +- config/samples/v1_tensorfusionconnection.yaml | 4 +- ...punode_controller.go => gpu_controller.go} | 24 ++-- ...troller_test.go => gpu_controller_test.go} | 19 ++- .../tensorfusionconnection_controller.go | 50 +++---- internal/scheduler/naive.go | 52 +++---- internal/scheduler/naive_test.go | 128 +++++++++--------- internal/scheduler/scheduler.go | 24 ++-- internal/worker/worker.go | 2 +- 22 files changed, 206 insertions(+), 220 deletions(-) rename api/v1/{gpunode_types.go => gpu_types.go} (71%) rename config/crd/bases/{tensor-fusion.ai_gpunodes.yaml => tensor-fusion.ai_gpus.yaml} (88%) rename config/rbac/{gpunode_editor_role.yaml => gpu_editor_role.yaml} (77%) rename config/rbac/{gpunode_viewer_role.yaml => gpu_viewer_role.yaml} (75%) rename config/samples/{v1_gpunode.yaml => v1_gpu.yaml} (87%) rename internal/controller/{gpunode_controller.go => gpu_controller.go} (67%) rename internal/controller/{gpunode_controller_test.go => gpu_controller_test.go} (84%) diff --git a/PROJECT b/PROJECT index 9104511..dfc0413 100644 --- a/PROJECT +++ b/PROJECT @@ -20,7 +20,7 @@ resources: crdVersion: v1 controller: true domain: tensor-fusion.ai - kind: GPUNode + kind: GPU path: github.com/NexusGPU/tensor-fusion-operator/api/v1 version: v1 - core: true diff --git a/api/v1/gpunode_types.go b/api/v1/gpu_types.go similarity index 71% rename from api/v1/gpunode_types.go rename to api/v1/gpu_types.go index 18ab7f1..9743024 100644 --- a/api/v1/gpunode_types.go +++ b/api/v1/gpu_types.go @@ -20,33 +20,32 @@ import ( metav1 
"k8s.io/apimachinery/pkg/apis/meta/v1" ) -// GPUNodeStatus defines the observed state of GPUNode. -type GPUNodeStatus struct { +// GPUStatus defines the observed state of GPU. +type GPUStatus struct { Capacity Resource `json:"capacity"` Available Resource `json:"available"` - Devices []string `json:"devices"` } // +kubebuilder:object:root=true // +kubebuilder:subresource:status // +kubebuilder:resource:scope=Cluster -// GPUNode is the Schema for the gpunodes API. -type GPUNode struct { +// GPU is the Schema for the gpus API. +type GPU struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` - Status GPUNodeStatus `json:"status,omitempty"` + Status GPUStatus `json:"status,omitempty"` } // +kubebuilder:object:root=true -// GPUNodeList contains a list of GPUNode. -type GPUNodeList struct { +// GPUList contains a list of GPU. +type GPUList struct { metav1.TypeMeta `json:",inline"` metav1.ListMeta `json:"metadata,omitempty"` - Items []GPUNode `json:"items"` + Items []GPU `json:"items"` } func init() { - SchemeBuilder.Register(&GPUNode{}, &GPUNodeList{}) + SchemeBuilder.Register(&GPU{}, &GPUList{}) } diff --git a/api/v1/tensorfusionconnection_types.go b/api/v1/tensorfusionconnection_types.go index c79bfa4..c14eb66 100644 --- a/api/v1/tensorfusionconnection_types.go +++ b/api/v1/tensorfusionconnection_types.go @@ -49,7 +49,7 @@ type TensorFusionConnectionStatus struct { Phase TensorFusionConnectionPhase `json:"phase"` ConnectionURL string `json:"connectionURL"` QosClass string `json:"qosClass,omitempty"` - Node string `json:"node,omitempty"` + GPU string `json:"gpu,omitempty"` } // +kubebuilder:object:root=true diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 8ff9bbe..cf117f0 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -25,25 +25,25 @@ import ( ) // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
-func (in *GPUNode) DeepCopyInto(out *GPUNode) { +func (in *GPU) DeepCopyInto(out *GPU) { *out = *in out.TypeMeta = in.TypeMeta in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) in.Status.DeepCopyInto(&out.Status) } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNode. -func (in *GPUNode) DeepCopy() *GPUNode { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPU. +func (in *GPU) DeepCopy() *GPU { if in == nil { return nil } - out := new(GPUNode) + out := new(GPU) in.DeepCopyInto(out) return out } // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *GPUNode) DeepCopyObject() runtime.Object { +func (in *GPU) DeepCopyObject() runtime.Object { if c := in.DeepCopy(); c != nil { return c } @@ -51,31 +51,31 @@ func (in *GPUNode) DeepCopyObject() runtime.Object { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *GPUNodeList) DeepCopyInto(out *GPUNodeList) { +func (in *GPUList) DeepCopyInto(out *GPUList) { *out = *in out.TypeMeta = in.TypeMeta in.ListMeta.DeepCopyInto(&out.ListMeta) if in.Items != nil { in, out := &in.Items, &out.Items - *out = make([]GPUNode, len(*in)) + *out = make([]GPU, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeList. -func (in *GPUNodeList) DeepCopy() *GPUNodeList { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUList. +func (in *GPUList) DeepCopy() *GPUList { if in == nil { return nil } - out := new(GPUNodeList) + out := new(GPUList) in.DeepCopyInto(out) return out } // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 
-func (in *GPUNodeList) DeepCopyObject() runtime.Object { +func (in *GPUList) DeepCopyObject() runtime.Object { if c := in.DeepCopy(); c != nil { return c } @@ -83,23 +83,18 @@ func (in *GPUNodeList) DeepCopyObject() runtime.Object { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *GPUNodeStatus) DeepCopyInto(out *GPUNodeStatus) { +func (in *GPUStatus) DeepCopyInto(out *GPUStatus) { *out = *in in.Capacity.DeepCopyInto(&out.Capacity) in.Available.DeepCopyInto(&out.Available) - if in.Devices != nil { - in, out := &in.Devices, &out.Devices - *out = make([]string, len(*in)) - copy(*out, *in) - } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeStatus. -func (in *GPUNodeStatus) DeepCopy() *GPUNodeStatus { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUStatus. +func (in *GPUStatus) DeepCopy() *GPUStatus { if in == nil { return nil } - out := new(GPUNodeStatus) + out := new(GPUStatus) in.DeepCopyInto(out) return out } diff --git a/cmd/main.go b/cmd/main.go index eb3e6a7..47055fe 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -162,12 +162,12 @@ func main() { os.Exit(1) } - if err = (&controller.GPUNodeReconciler{ + if err = (&controller.GPUReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), Scheduler: scheduler, }).SetupWithManager(ctx, mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "GPUNode") + setupLog.Error(err, "unable to create controller", "controller", "GPU") os.Exit(1) } diff --git a/config/crd/bases/tensor-fusion.ai_gpunodes.yaml b/config/crd/bases/tensor-fusion.ai_gpus.yaml similarity index 88% rename from config/crd/bases/tensor-fusion.ai_gpunodes.yaml rename to config/crd/bases/tensor-fusion.ai_gpus.yaml index bb62054..38ee9de 100644 --- a/config/crd/bases/tensor-fusion.ai_gpunodes.yaml +++ b/config/crd/bases/tensor-fusion.ai_gpus.yaml @@ -4,20 
+4,20 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.16.4 - name: gpunodes.tensor-fusion.ai + name: gpus.tensor-fusion.ai spec: group: tensor-fusion.ai names: - kind: GPUNode - listKind: GPUNodeList - plural: gpunodes - singular: gpunode + kind: GPU + listKind: GPUList + plural: gpus + singular: gpu scope: Cluster versions: - name: v1 schema: openAPIV3Schema: - description: GPUNode is the Schema for the gpunodes API. + description: GPU is the Schema for the gpus API. properties: apiVersion: description: |- @@ -37,7 +37,7 @@ spec: metadata: type: object status: - description: GPUNodeStatus defines the observed state of GPUNode. + description: GPUStatus defines the observed state of GPU. properties: available: properties: @@ -75,14 +75,9 @@ spec: - tflops - vram type: object - devices: - items: - type: string - type: array required: - available - capacity - - devices type: object type: object served: true diff --git a/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml b/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml index 43407d0..7b2b288 100644 --- a/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml +++ b/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml @@ -42,7 +42,7 @@ spec: properties: resources: properties: - limit: + limits: properties: tflops: anyOf: @@ -60,7 +60,7 @@ spec: - tflops - vram type: object - request: + requests: properties: tflops: anyOf: @@ -79,8 +79,8 @@ spec: - vram type: object required: - - limit - - request + - limits + - requests type: object required: - resources diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml index ef965fc..7532d6a 100644 --- a/config/crd/kustomization.yaml +++ b/config/crd/kustomization.yaml @@ -3,7 +3,7 @@ # It should be run by config/default resources: - bases/tensor-fusion.ai_tensorfusionconnections.yaml -- bases/tensor-fusion.ai_gpunodes.yaml +- bases/tensor-fusion.ai_gpus.yaml # 
+kubebuilder:scaffold:crdkustomizeresource patches: diff --git a/config/rbac/gpunode_editor_role.yaml b/config/rbac/gpu_editor_role.yaml similarity index 77% rename from config/rbac/gpunode_editor_role.yaml rename to config/rbac/gpu_editor_role.yaml index 10e6ec1..e512234 100644 --- a/config/rbac/gpunode_editor_role.yaml +++ b/config/rbac/gpu_editor_role.yaml @@ -1,16 +1,16 @@ -# permissions for end users to edit gpunodes. +# permissions for end users to edit gpus. apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: app.kubernetes.io/name: tensor-fusion-operator app.kubernetes.io/managed-by: kustomize - name: gpunode-editor-role + name: gpu-editor-role rules: - apiGroups: - tensor-fusion.ai resources: - - gpunodes + - gpus verbs: - create - delete @@ -22,6 +22,6 @@ rules: - apiGroups: - tensor-fusion.ai resources: - - gpunodes/status + - gpus/status verbs: - get diff --git a/config/rbac/gpunode_viewer_role.yaml b/config/rbac/gpu_viewer_role.yaml similarity index 75% rename from config/rbac/gpunode_viewer_role.yaml rename to config/rbac/gpu_viewer_role.yaml index 376b12f..0a45f0c 100644 --- a/config/rbac/gpunode_viewer_role.yaml +++ b/config/rbac/gpu_viewer_role.yaml @@ -1,16 +1,16 @@ -# permissions for end users to view gpunodes. +# permissions for end users to view gpus. 
apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: app.kubernetes.io/name: tensor-fusion-operator app.kubernetes.io/managed-by: kustomize - name: gpunode-viewer-role + name: gpu-viewer-role rules: - apiGroups: - tensor-fusion.ai resources: - - gpunodes + - gpus verbs: - get - list @@ -18,6 +18,6 @@ rules: - apiGroups: - tensor-fusion.ai resources: - - gpunodes/status + - gpus/status verbs: - get diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml index 0bb7cfe..c737ba7 100644 --- a/config/rbac/kustomization.yaml +++ b/config/rbac/kustomization.yaml @@ -22,8 +22,8 @@ resources: # default, aiding admins in cluster management. Those roles are # not used by the Project itself. You can comment the following lines # if you do not want those helpers be installed with your Project. -- gpunode_editor_role.yaml -- gpunode_viewer_role.yaml +- gpu_editor_role.yaml +- gpu_viewer_role.yaml - tensorfusionconnection_editor_role.yaml - tensorfusionconnection_viewer_role.yaml diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index a2a838e..95a7559 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -7,7 +7,7 @@ rules: - apiGroups: - tensor-fusion.ai resources: - - gpunodes + - gpus - tensorfusionconnections verbs: - create @@ -20,14 +20,14 @@ rules: - apiGroups: - tensor-fusion.ai resources: - - gpunodes/finalizers + - gpus/finalizers - tensorfusionconnections/finalizers verbs: - update - apiGroups: - tensor-fusion.ai resources: - - gpunodes/status + - gpus/status - tensorfusionconnections/status verbs: - get diff --git a/config/samples/kustomization.yaml b/config/samples/kustomization.yaml index 387e950..7b7b116 100644 --- a/config/samples/kustomization.yaml +++ b/config/samples/kustomization.yaml @@ -1,5 +1,5 @@ ## Append samples of your project ## resources: - v1_tensorfusionconnection.yaml -- v1_gpunode.yaml +- v1_gpu.yaml # +kubebuilder:scaffold:manifestskustomizesamples diff --git 
a/config/samples/v1_gpunode.yaml b/config/samples/v1_gpu.yaml similarity index 87% rename from config/samples/v1_gpunode.yaml rename to config/samples/v1_gpu.yaml index 84a50cb..984bfb5 100644 --- a/config/samples/v1_gpunode.yaml +++ b/config/samples/v1_gpu.yaml @@ -1,10 +1,10 @@ apiVersion: tensor-fusion.ai/v1 -kind: GPUNode +kind: GPU metadata: labels: app.kubernetes.io/name: tensor-fusion-operator app.kubernetes.io/managed-by: kustomize - name: gpunode-sample + name: gpu-sample status: capacity: tflops: '200' diff --git a/config/samples/v1_tensorfusionconnection.yaml b/config/samples/v1_tensorfusionconnection.yaml index cc634ff..bf1072f 100644 --- a/config/samples/v1_tensorfusionconnection.yaml +++ b/config/samples/v1_tensorfusionconnection.yaml @@ -8,9 +8,9 @@ metadata: namespace: tensor-fusion spec: resources: - limit: + limits: tflops: '100' vram: 8Gi - request: + requests: tflops: '20' vram: 9Gi diff --git a/internal/controller/gpunode_controller.go b/internal/controller/gpu_controller.go similarity index 67% rename from internal/controller/gpunode_controller.go rename to internal/controller/gpu_controller.go index caea975..e6533c1 100644 --- a/internal/controller/gpunode_controller.go +++ b/internal/controller/gpu_controller.go @@ -29,41 +29,41 @@ import ( scheduler "github.com/NexusGPU/tensor-fusion-operator/internal/scheduler" ) -// GPUNodeReconciler reconciles a GPUNode object -type GPUNodeReconciler struct { +// GPUReconciler reconciles a GPU object +type GPUReconciler struct { client.Client Scheme *runtime.Scheme Scheduler scheduler.Scheduler } -// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes,verbs=get;list;watch;create;update;patch;delete -// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes/status,verbs=get;update;patch -// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes/finalizers,verbs=update +// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpus,verbs=get;list;watch;create;update;patch;delete +// 
+kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpus/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpus/finalizers,verbs=update // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. -func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { +func (r *GPUReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { // TODO: Calculate tflops and update capacity here return ctrl.Result{}, nil } // SetupWithManager sets up the controller with the Manager. -func (r *GPUNodeReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error { +func (r *GPUReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). - For(&tfv1.GPUNode{}). - Named("gpunode"). + For(&tfv1.GPU{}). + Named("gpu"). WithEventFilter( predicate.Funcs{ CreateFunc: func(e event.CreateEvent) bool { - r.Scheduler.OnAdd(e.Object.(*tfv1.GPUNode)) + r.Scheduler.OnAdd(e.Object.(*tfv1.GPU)) return true }, UpdateFunc: func(e event.UpdateEvent) bool { - r.Scheduler.OnUpdate(e.ObjectOld.(*tfv1.GPUNode), e.ObjectNew.(*tfv1.GPUNode)) + r.Scheduler.OnUpdate(e.ObjectOld.(*tfv1.GPU), e.ObjectNew.(*tfv1.GPU)) return true }, DeleteFunc: func(e event.DeleteEvent) bool { - r.Scheduler.OnDelete(e.Object.(*tfv1.GPUNode)) + r.Scheduler.OnDelete(e.Object.(*tfv1.GPU)) return true }, }, diff --git a/internal/controller/gpunode_controller_test.go b/internal/controller/gpu_controller_test.go similarity index 84% rename from internal/controller/gpunode_controller_test.go rename to internal/controller/gpu_controller_test.go index 8cf0c89..3742307 100644 --- a/internal/controller/gpunode_controller_test.go +++ b/internal/controller/gpu_controller_test.go @@ -22,15 +22,14 @@ import ( . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/reconcile" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" ) -var _ = Describe("GPUNode Controller", func() { +var _ = Describe("GPU Controller", func() { Context("When reconciling a resource", func() { const resourceName = "test-resource" @@ -40,13 +39,13 @@ var _ = Describe("GPUNode Controller", func() { Name: resourceName, Namespace: "default", // TODO(user):Modify as needed } - gpunode := &tensorfusionaiv1.GPUNode{} + gpu := &tensorfusionaiv1.GPU{} BeforeEach(func() { - By("creating the custom resource for the Kind GPUNode") - err := k8sClient.Get(ctx, typeNamespacedName, gpunode) + By("creating the custom resource for the Kind GPU") + err := k8sClient.Get(ctx, typeNamespacedName, gpu) if err != nil && errors.IsNotFound(err) { - resource := &tensorfusionaiv1.GPUNode{ + resource := &tensorfusionaiv1.GPU{ ObjectMeta: metav1.ObjectMeta{ Name: resourceName, Namespace: "default", @@ -59,16 +58,16 @@ var _ = Describe("GPUNode Controller", func() { AfterEach(func() { // TODO(user): Cleanup logic after each test, like removing the resource instance. 
- resource := &tensorfusionaiv1.GPUNode{} + resource := &tensorfusionaiv1.GPU{} err := k8sClient.Get(ctx, typeNamespacedName, resource) Expect(err).NotTo(HaveOccurred()) - By("Cleanup the specific resource instance GPUNode") + By("Cleanup the specific resource instance GPU") Expect(k8sClient.Delete(ctx, resource)).To(Succeed()) }) It("should successfully reconcile the resource", func() { By("Reconciling the created resource") - controllerReconciler := &GPUNodeReconciler{ + controllerReconciler := &GPUReconciler{ Client: k8sClient, Scheme: k8sClient.Scheme(), } diff --git a/internal/controller/tensorfusionconnection_controller.go b/internal/controller/tensorfusionconnection_controller.go index 65937b1..6280a86 100644 --- a/internal/controller/tensorfusionconnection_controller.go +++ b/internal/controller/tensorfusionconnection_controller.go @@ -92,26 +92,26 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct return ctrl.Result{}, nil } - var node *tfv1.GPUNode + var gpu *tfv1.GPU // If status is not set or pending, try to schedule if connection.Status.Phase == "" || connection.Status.Phase == tfv1.TensorFusionConnectionPending { - // Try to get an available node from scheduler + // Try to get an available gpu from scheduler var err error - node, err = r.Scheduler.Schedule(connection.Spec.Resources.Requests) + gpu, err = r.Scheduler.Schedule(connection.Spec.Resources.Requests) if err != nil { log.Info(err.Error()) connection.Status.Phase = tfv1.TensorFusionConnectionPending - } else if node != nil { + } else if gpu != nil { connection.Status.Phase = tfv1.TensorFusionConnectionRunning - connection.Status.ConnectionURL = worker.GenerateConnectionURL(node, connection) - // Store the node name for cleanup - connection.Status.Node = node.Name + connection.Status.ConnectionURL = worker.GenerateConnectionURL(gpu, connection) + // Store the gpu name for cleanup + connection.Status.GPU = gpu.Name } else { connection.Status.Phase = 
tfv1.TensorFusionConnectionPending } } - if err := r.MustUpdateStatus(ctx, connection, node); err != nil { + if err := r.MustUpdateStatus(ctx, connection, gpu); err != nil { return ctrl.Result{}, err } @@ -123,26 +123,26 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct // handleDeletion handles cleanup of external dependencies func (r *TensorFusionConnectionReconciler) handleDeletion(ctx context.Context, connection *tfv1.TensorFusionConnection) error { - if connection.Status.Node == "" { - return nil // No node was allocated, nothing to clean up + if connection.Status.GPU == "" { + return nil // No gpu was allocated, nothing to clean up } - // Get the node - node := &tfv1.GPUNode{} - if err := r.Get(ctx, client.ObjectKey{Name: connection.Status.Node}, node); err != nil { + // Get the gpu + gpu := &tfv1.GPU{} + if err := r.Get(ctx, client.ObjectKey{Name: connection.Status.GPU}, gpu); err != nil { if errors.IsNotFound(err) { - // Node is already gone, nothing to do + // gpu is already gone, nothing to do return nil } return err } // Release the resources - if err := r.Scheduler.Release(connection.Spec.Resources.Requests, node); err != nil { + if err := r.Scheduler.Release(connection.Spec.Resources.Requests, gpu); err != nil { return err } - return r.MustUpdateStatus(ctx, connection, node) + return r.MustUpdateStatus(ctx, connection, gpu) } // Helper functions to handle finalizers @@ -165,7 +165,7 @@ func removeString(slice []string, s string) []string { return result } -func (r *TensorFusionConnectionReconciler) MustUpdateStatus(ctx context.Context, connection *tfv1.TensorFusionConnection, gpuNode *tfv1.GPUNode) error { +func (r *TensorFusionConnectionReconciler) MustUpdateStatus(ctx context.Context, connection *tfv1.TensorFusionConnection, gpu *tfv1.GPU) error { return retry.RetryOnConflict(retry.DefaultBackoff, func() error { // Get the latest version of the connection latestConnection := &tfv1.TensorFusionConnection{} @@ -184,20 
+184,20 @@ func (r *TensorFusionConnectionReconciler) MustUpdateStatus(ctx context.Context, return err } - if gpuNode != nil { - // Get the latest version of the node - latestNode := &tfv1.GPUNode{} + if gpu != nil { + // Get the latest version of the gpu + latestgpu := &tfv1.GPU{} if err := r.Get(ctx, client.ObjectKey{ - Name: gpuNode.Name, - Namespace: gpuNode.Namespace, - }, latestNode); err != nil { + Name: gpu.Name, + Namespace: gpu.Namespace, + }, latestgpu); err != nil { return err } // Update the status fields we care about - latestNode.Status.Available = gpuNode.Status.Available - if err := r.Status().Update(ctx, latestNode); err != nil { + latestgpu.Status.Available = gpu.Status.Available + if err := r.Status().Update(ctx, latestgpu); err != nil { return err } } diff --git a/internal/scheduler/naive.go b/internal/scheduler/naive.go index 2af423e..7b2627e 100644 --- a/internal/scheduler/naive.go +++ b/internal/scheduler/naive.go @@ -9,70 +9,70 @@ import ( // NaiveScheduler implements a simple scheduling strategy type NaiveScheduler struct { - sync.Mutex - nodes map[string]*tfv1.GPUNode + sync.RWMutex + gpus map[string]*tfv1.GPU } // NewNaiveScheduler creates a new NaiveScheduler func NewNaiveScheduler() *NaiveScheduler { return &NaiveScheduler{ - nodes: make(map[string]*tfv1.GPUNode), + gpus: make(map[string]*tfv1.GPU), } } // Schedule implements Scheduler interface -func (s *NaiveScheduler) Schedule(request tfv1.Resource) (*tfv1.GPUNode, error) { +func (s *NaiveScheduler) Schedule(request tfv1.Resource) (*tfv1.GPU, error) { s.Lock() defer s.Unlock() - // Simple strategy: return the first node that has enough resources - for _, node := range s.nodes { - if node.Status.Available.Tflops.Cmp(request.Tflops) >= 0 && - node.Status.Available.Vram.Cmp(request.Vram) >= 0 { - // Update the node's available resources - node.Status.Available.Tflops.Sub(request.Tflops) - node.Status.Available.Vram.Sub(request.Vram) - return node, nil + // Simple strategy: return the 
first gpu that has enough resources + for _, gpu := range s.gpus { + if gpu.Status.Available.Tflops.Cmp(request.Tflops) >= 0 && + gpu.Status.Available.Vram.Cmp(request.Vram) >= 0 { + // Update the gpu's available resources + gpu.Status.Available.Tflops.Sub(request.Tflops) + gpu.Status.Available.Vram.Sub(request.Vram) + return gpu, nil } } - return nil, fmt.Errorf("no suitable node found for request: %v", request) + return nil, fmt.Errorf("no suitable gpu found for request: %v", request) } // OnAdd implements Scheduler interface -func (s *NaiveScheduler) OnAdd(node *tfv1.GPUNode) { +func (s *NaiveScheduler) OnAdd(gpu *tfv1.GPU) { s.Lock() defer s.Unlock() - s.nodes[node.Name] = node + s.gpus[gpu.Name] = gpu } // OnUpdate implements Scheduler interface -func (s *NaiveScheduler) OnUpdate(oldNode, newNode *tfv1.GPUNode) { +func (s *NaiveScheduler) OnUpdate(oldgpu, newgpu *tfv1.GPU) { s.Lock() defer s.Unlock() - s.nodes[newNode.Name] = newNode + s.gpus[newgpu.Name] = newgpu } // OnDelete implements Scheduler interface -func (s *NaiveScheduler) OnDelete(node *tfv1.GPUNode) { +func (s *NaiveScheduler) OnDelete(gpu *tfv1.GPU) { s.Lock() defer s.Unlock() - delete(s.nodes, node.Name) + delete(s.gpus, gpu.Name) } // Release implements Scheduler interface -func (s *NaiveScheduler) Release(request tfv1.Resource, node *tfv1.GPUNode) error { +func (s *NaiveScheduler) Release(request tfv1.Resource, gpu *tfv1.GPU) error { s.Lock() defer s.Unlock() - existingNode, ok := s.nodes[node.Name] + existinggpu, ok := s.gpus[gpu.Name] if !ok { - return fmt.Errorf("node %s not found", node.Name) + return fmt.Errorf("gpu %s not found", gpu.Name) } // Add back the released resources - existingNode.Status.Available.Tflops.Add(request.Tflops) - existingNode.Status.Available.Vram.Add(request.Vram) - // output the updated node - node.Status.Available = existingNode.Status.Available + existinggpu.Status.Available.Tflops.Add(request.Tflops) + existinggpu.Status.Available.Vram.Add(request.Vram) + // 
output the updated gpu + gpu.Status.Available = existinggpu.Status.Available return nil } diff --git a/internal/scheduler/naive_test.go b/internal/scheduler/naive_test.go index 3cdfea4..aac3b17 100644 --- a/internal/scheduler/naive_test.go +++ b/internal/scheduler/naive_test.go @@ -8,12 +8,12 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -func createGPUNode(name string, tflops, vram string) *tfv1.GPUNode { - return &tfv1.GPUNode{ +func createGPU(name string, tflops, vram string) *tfv1.GPU { + return &tfv1.GPU{ ObjectMeta: metav1.ObjectMeta{ Name: name, }, - Status: tfv1.GPUNodeStatus{ + Status: tfv1.GPUStatus{ Available: tfv1.Resource{ Tflops: resource.MustParse(tflops), Vram: resource.MustParse(vram), @@ -33,59 +33,59 @@ func createRequest(tflops, vram string) tfv1.Resource { func TestNaiveScheduler_Schedule(t *testing.T) { tests := []struct { name string - nodes []*tfv1.GPUNode + gpus []*tfv1.GPU request tfv1.Resource - wantNode string + wantgpu string wantError bool wantRemainingTflops string wantRemainingVram string }{ { name: "simple match", - nodes: []*tfv1.GPUNode{ - createGPUNode("node1", "100", "16Gi"), + gpus: []*tfv1.GPU{ + createGPU("gpu1", "100", "16Gi"), }, request: createRequest("50", "8Gi"), - wantNode: "node1", + wantgpu: "gpu1", wantError: false, wantRemainingTflops: "50", wantRemainingVram: "8Gi", }, { - name: "no nodes", - nodes: []*tfv1.GPUNode{}, + name: "no gpus", + gpus: []*tfv1.GPU{}, request: createRequest("50", "8Gi"), - wantNode: "", + wantgpu: "", wantError: true, }, { name: "insufficient resources", - nodes: []*tfv1.GPUNode{ - createGPUNode("node1", "40", "16Gi"), + gpus: []*tfv1.GPU{ + createGPU("gpu1", "40", "16Gi"), }, request: createRequest("50", "8Gi"), - wantNode: "", + wantgpu: "", wantError: true, }, { - name: "multiple nodes, first fit", - nodes: []*tfv1.GPUNode{ - createGPUNode("node1", "40", "16Gi"), - createGPUNode("node2", "100", "32Gi"), + name: "multiple gpus, first fit", + gpus: []*tfv1.GPU{ + 
createGPU("gpu1", "40", "16Gi"), + createGPU("gpu2", "100", "32Gi"), }, request: createRequest("50", "8Gi"), - wantNode: "node2", + wantgpu: "gpu2", wantError: false, wantRemainingTflops: "50", wantRemainingVram: "24Gi", }, { name: "exact match", - nodes: []*tfv1.GPUNode{ - createGPUNode("node1", "50", "8Gi"), + gpus: []*tfv1.GPU{ + createGPU("gpu1", "50", "8Gi"), }, request: createRequest("50", "8Gi"), - wantNode: "node1", + wantgpu: "gpu1", wantError: false, wantRemainingTflops: "0", wantRemainingVram: "0", @@ -96,9 +96,9 @@ func TestNaiveScheduler_Schedule(t *testing.T) { t.Run(tt.name, func(t *testing.T) { s := NewNaiveScheduler() - // Add nodes - for _, node := range tt.nodes { - s.OnAdd(node) + // Add gpus + for _, gpu := range tt.gpus { + s.OnAdd(gpu) } // Try to schedule @@ -113,11 +113,11 @@ func TestNaiveScheduler_Schedule(t *testing.T) { // Check result if !tt.wantError { if got == nil { - t.Error("Schedule() returned nil node when error not expected") + t.Error("Schedule() returned nil gpu when error not expected") return } - if got.Name != tt.wantNode { - t.Errorf("Schedule() got node = %v, want %v", got.Name, tt.wantNode) + if got.Name != tt.wantgpu { + t.Errorf("Schedule() got gpu = %v, want %v", got.Name, tt.wantgpu) } // Check remaining resources @@ -138,57 +138,57 @@ func TestNaiveScheduler_Schedule(t *testing.T) { } } -func TestNaiveScheduler_NodeOperations(t *testing.T) { +func TestNaiveScheduler_gpuOperations(t *testing.T) { s := NewNaiveScheduler() - node1 := createGPUNode("node1", "100", "16Gi") + gpu1 := createGPU("gpu1", "100", "16Gi") request := createRequest("50", "8Gi") // Test OnAdd - s.OnAdd(node1) + s.OnAdd(gpu1) got, err := s.Schedule(request) - if err != nil || got.Name != "node1" { - t.Errorf("After OnAdd: Schedule() got = %v, want node1", got) + if err != nil || got.Name != "gpu1" { + t.Errorf("After OnAdd: Schedule() got = %v, want gpu1", got) } // Test OnUpdate - node1Updated := createGPUNode("node1", "40", "16Gi") - 
s.OnUpdate(node1, node1Updated) + gpu1Updated := createGPU("gpu1", "40", "16Gi") + s.OnUpdate(gpu1, gpu1Updated) _, err = s.Schedule(request) if err == nil { t.Error("After OnUpdate: Schedule() should fail with insufficient resources") } // Test OnDelete - s.OnDelete(node1Updated) + s.OnDelete(gpu1Updated) _, err = s.Schedule(request) if err == nil { - t.Error("After OnDelete: Schedule() should fail with no nodes") + t.Error("After OnDelete: Schedule() should fail with no gpus") } } func TestNaiveScheduler_Release(t *testing.T) { tests := []struct { name string - node *tfv1.GPUNode - schedule *tfv1.Resource - release *tfv1.Resource - wantError bool + gpu *tfv1.GPU + schedule *tfv1.Resource + release *tfv1.Resource + wantError bool wantRemainingTflops string wantRemainingVram string }{ { - name: "release non-existent node", - node: createGPUNode("node1", "100", "16Gi"), + name: "release non-existent gpu", + gpu: createGPU("gpu1", "100", "16Gi"), release: &tfv1.Resource{}, wantError: true, }, { name: "release after scheduling", - node: &tfv1.GPUNode{ + gpu: &tfv1.GPU{ ObjectMeta: metav1.ObjectMeta{ - Name: "node1", + Name: "gpu1", }, - Status: tfv1.GPUNodeStatus{ + Status: tfv1.GPUStatus{ Capacity: tfv1.Resource{ Tflops: resource.MustParse("100"), Vram: resource.MustParse("16Gi"), @@ -207,17 +207,17 @@ func TestNaiveScheduler_Release(t *testing.T) { Tflops: resource.MustParse("50"), Vram: resource.MustParse("8Gi"), }, - wantError: false, + wantError: false, wantRemainingTflops: "100", wantRemainingVram: "16Gi", }, { name: "partial release", - node: &tfv1.GPUNode{ + gpu: &tfv1.GPU{ ObjectMeta: metav1.ObjectMeta{ - Name: "node1", + Name: "gpu1", }, - Status: tfv1.GPUNodeStatus{ + Status: tfv1.GPUStatus{ Capacity: tfv1.Resource{ Tflops: resource.MustParse("100"), Vram: resource.MustParse("16Gi"), @@ -236,7 +236,7 @@ func TestNaiveScheduler_Release(t *testing.T) { Tflops: resource.MustParse("30"), Vram: resource.MustParse("5Gi"), }, - wantError: false, + wantError: 
false, wantRemainingTflops: "70", wantRemainingVram: "11Gi", }, @@ -247,31 +247,31 @@ func TestNaiveScheduler_Release(t *testing.T) { s := NewNaiveScheduler() if !tt.wantError { - // Add the node first - s.OnAdd(tt.node) + // Add the gpu first + s.OnAdd(tt.gpu) // Schedule some resources if needed if tt.schedule != nil { - node, err := s.Schedule(*tt.schedule) + gpu, err := s.Schedule(*tt.schedule) if err != nil { t.Errorf("Schedule() error = %v", err) return } // Verify resources were allocated - expectedTflops := tt.node.Status.Capacity.Tflops.DeepCopy() - expectedVram := tt.node.Status.Capacity.Vram.DeepCopy() + expectedTflops := tt.gpu.Status.Capacity.Tflops.DeepCopy() + expectedVram := tt.gpu.Status.Capacity.Vram.DeepCopy() expectedTflops.Sub(tt.schedule.Tflops) expectedVram.Sub(tt.schedule.Vram) - if node.Status.Available.Tflops.Cmp(expectedTflops) != 0 || - node.Status.Available.Vram.Cmp(expectedVram) != 0 { + if gpu.Status.Available.Tflops.Cmp(expectedTflops) != 0 || + gpu.Status.Available.Vram.Cmp(expectedVram) != 0 { t.Errorf("Schedule() did not allocate resources correctly") return } } } - err := s.Release(*tt.release, tt.node) + err := s.Release(*tt.release, tt.gpu) if (err != nil) != tt.wantError { t.Errorf("Release() error = %v, wantError %v", err, tt.wantError) return @@ -279,12 +279,12 @@ func TestNaiveScheduler_Release(t *testing.T) { if !tt.wantError { // Verify resources were restored correctly - node := s.nodes[tt.node.Name] - if node.Status.Available.Tflops.String() != tt.wantRemainingTflops || - node.Status.Available.Vram.String() != tt.wantRemainingVram { + gpu := s.gpus[tt.gpu.Name] + if gpu.Status.Available.Tflops.String() != tt.wantRemainingTflops || + gpu.Status.Available.Vram.String() != tt.wantRemainingVram { t.Errorf("Release() resources incorrect, got tflops=%v vram=%v, want tflops=%v vram=%v", - node.Status.Available.Tflops.String(), - node.Status.Available.Vram.String(), + gpu.Status.Available.Tflops.String(), + 
gpu.Status.Available.Vram.String(), tt.wantRemainingTflops, tt.wantRemainingVram) } diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go index bd18893..a17ed2b 100644 --- a/internal/scheduler/scheduler.go +++ b/internal/scheduler/scheduler.go @@ -4,20 +4,18 @@ import ( tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" ) -// Scheduler is the interface that wraps the scheduling methods +// Scheduler is the interface that wraps the basic scheduling methods. type Scheduler interface { - // Schedule takes a Resource Request and returns the pointer of the GPU node - // that can accommodate the request. If no suitable node is found, it returns - // an nil pointer and an error. - Schedule(request tfv1.Resource) (*tfv1.GPUNode, error) + // Schedule schedules a request to a gpu. + Schedule(request tfv1.Resource) (*tfv1.GPU, error) - // Release frees the allocated resources of a node - Release(request tfv1.Resource, node *tfv1.GPUNode) error + // Release releases a request from a gpu. + Release(request tfv1.Resource, gpu *tfv1.GPU) error - // OnAdd is called when a new node is added - OnAdd(node *tfv1.GPUNode) - // OnUpdate is called when a node is modified - OnUpdate(oldNode, newNode *tfv1.GPUNode) - // OnDelete is called when a node is deleted - OnDelete(node *tfv1.GPUNode) + // OnAdd is called when a gpu is added. + OnAdd(gpu *tfv1.GPU) + // OnUpdate is called when a gpu is updated. + OnUpdate(oldGPU, newGPU *tfv1.GPU) + // OnDelete is called when a gpu is deleted. 
+ OnDelete(gpu *tfv1.GPU) } diff --git a/internal/worker/worker.go b/internal/worker/worker.go index 74b93d3..d3509c5 100644 --- a/internal/worker/worker.go +++ b/internal/worker/worker.go @@ -4,6 +4,6 @@ import ( tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" ) -func GenerateConnectionURL(_node *tfv1.GPUNode, _connection *tfv1.TensorFusionConnection) string { +func GenerateConnectionURL(_gpu *tfv1.GPU, _connection *tfv1.TensorFusionConnection) string { return "TODO://" } From e9de90245af54e95504fab7f9160d90dad6a54c7 Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Mon, 9 Dec 2024 08:20:44 +0000 Subject: [PATCH 20/22] feat: add release.yaml --- .github/workflows/release.yml | 57 +++++++++++++++++++++++++++++++++++ Makefile | 2 +- 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/release.yml diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..4194e51 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,57 @@ +name: Release + +on: + push: + branches: + - main + workflow_dispatch: + +jobs: + release: + runs-on: ubuntu-20.04 + outputs: + published: ${{ steps.semantic.outputs.new_release_published }} + version: ${{ steps.semantic.outputs.new_release_version }} + steps: + - uses: actions/checkout@v3 + + - name: Semantic Release + id: semantic + uses: cycjimmy/semantic-release-action@v3.4.1 + with: + semantic_version: 18 + extra_plugins: | + @semantic-release/release-notes-generator@^10 + @semantic-release/github@^8 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + publish_image: + needs: + - release + if: needs.release.outputs.published == 'true' + runs-on: ubuntu-latest + outputs: + image_digest: ${{ steps.build.outputs.digest }} + steps: + - uses: actions/checkout@v3 + - id: meta + uses: docker/metadata-action@v4 + with: + images: | + tensorfusion/tensor-fusion-operator + tags: | + type=semver,pattern={{needs.release.outputs.version}} 
+ - name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Build and push + uses: docker/build-push-action@v3 + with: + context: . + push: true + tags: ${{ steps.meta.outputs.tags }} + no-cache: true diff --git a/Makefile b/Makefile index a95fe02..73bfa20 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Image URL to use all building/pushing image targets -IMG ?= controller:latest +IMG ?= tensorfusion/tensor-fusion-operator:latest # ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary. ENVTEST_K8S_VERSION = 1.31.0 From 350525a8d8febc7ec4b888bc5e40c2b259bebe54 Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Mon, 9 Dec 2024 08:39:39 +0000 Subject: [PATCH 21/22] feat(webhook): fix tensor fusion pod mutation handler tests --- .github/workflows/test-e2e.yml | 5 +++-- ...sor-fusion.ai_tensorfusionconnections.yaml | 2 +- internal/webhook/v1/pod_webhook.go | 21 ++++++++++++------- internal/webhook/v1/pod_webhook_test.go | 9 ++++---- 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/.github/workflows/test-e2e.yml b/.github/workflows/test-e2e.yml index 8780644..70a12f2 100644 --- a/.github/workflows/test-e2e.yml +++ b/.github/workflows/test-e2e.yml @@ -1,8 +1,9 @@ name: E2E Tests on: - push: - pull_request: + workflow_dispatch: + # push: + # pull_request: jobs: test-e2e: diff --git a/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml b/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml index 7b2b288..37d47be 100644 --- a/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml +++ b/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml @@ -91,7 +91,7 @@ spec: properties: connectionURL: type: string - node: + gpu: type: string phase: type: string diff --git a/internal/webhook/v1/pod_webhook.go b/internal/webhook/v1/pod_webhook.go index 
2b9bfcf..16cc35d 100644 --- a/internal/webhook/v1/pod_webhook.go +++ b/internal/webhook/v1/pod_webhook.go @@ -67,6 +67,10 @@ func (m *TensorFusionPodMutator) Handle(ctx context.Context, req admission.Reque log.Info("Mutating pod", "name", pod.Name, "namespace", pod.Namespace) reqs := parseTFReq(pod) + if len(reqs) == 0 { + return admission.Allowed("no tensor fusion requirements found") + } + // 1. Inject initContainer and env variables patches, err := m.patchTFClient(pod, reqs) if err != nil { @@ -108,9 +112,14 @@ func parseTFReq(pod *corev1.Pod) []TFReq { for _, container := range pod.Spec.Containers { containerName := container.Name - // Check if tensor fusion is enabled for this container - enableKey := fmt.Sprintf(constants.EnableContainerAnnotationFormat, containerName) - if enableStr, ok := pod.Annotations[enableKey]; !ok || enableStr != "true" { + // Check if TF requirements exist for this container + tflopsKey := fmt.Sprintf(constants.TFLOPSContainerAnnotationFormat, containerName) + vramKey := fmt.Sprintf(constants.VRAMContainerAnnotationFormat, containerName) + + tflopsStr, hasTflops := pod.Annotations[tflopsKey] + vramStr, hasVram := pod.Annotations[vramKey] + + if !hasTflops && !hasVram { continue } @@ -119,8 +128,7 @@ func parseTFReq(pod *corev1.Pod) []TFReq { } // Parse TFLOPS requirement - tflopsKey := fmt.Sprintf(constants.TFLOPSContainerAnnotationFormat, containerName) - if tflopsStr, ok := pod.Annotations[tflopsKey]; ok { + if hasTflops { tflops, err := resource.ParseQuantity(tflopsStr) if err == nil { req.Tflops = tflops @@ -128,8 +136,7 @@ func parseTFReq(pod *corev1.Pod) []TFReq { } // Parse VRAM requirement - vramKey := fmt.Sprintf(constants.VRAMContainerAnnotationFormat, containerName) - if vramStr, ok := pod.Annotations[vramKey]; ok { + if hasVram { vram, err := resource.ParseQuantity(vramStr) if err == nil { req.Vram = vram diff --git a/internal/webhook/v1/pod_webhook_test.go b/internal/webhook/v1/pod_webhook_test.go index 87eaa79..73b401d 
100644 --- a/internal/webhook/v1/pod_webhook_test.go +++ b/internal/webhook/v1/pod_webhook_test.go @@ -23,6 +23,7 @@ import ( tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1" "github.com/NexusGPU/tensor-fusion-operator/internal/config" + "github.com/NexusGPU/tensor-fusion-operator/internal/constants" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" admissionv1 "k8s.io/api/admission/v1" @@ -67,8 +68,8 @@ var _ = Describe("TensorFusionPodMutator", func() { Name: "test-pod", Namespace: "default", Annotations: map[string]string{ - "tf.nexusgpu.com/tflops": "100", - "tf.nexusgpu.com/vram": "16Gi", + constants.TensorFusionDomain + "/tflops-main": "100", + constants.TensorFusionDomain + "/vram-main": "16Gi", }, }, Spec: corev1.PodSpec{ @@ -158,8 +159,8 @@ var _ = Describe("TensorFusionPodMutator", func() { pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Annotations: map[string]string{ - "tf.nexusgpu.com/tflops": "100", - "tf.nexusgpu.com/vram": "16Gi", + constants.TensorFusionDomain + "/tflops-test-container": "100", + constants.TensorFusionDomain + "/vram-test-container": "16Gi", }, }, Spec: corev1.PodSpec{ From 4213e1e2077653a2ee563e152f00a40d39fb1255 Mon Sep 17 00:00:00 2001 From: 0x5457 <0x5457@protonmail.com> Date: Mon, 9 Dec 2024 09:18:15 +0000 Subject: [PATCH 22/22] ci: update release workflow to use ubuntu-latest runner --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4194e51..44d7181 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -8,7 +8,7 @@ on: jobs: release: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest outputs: published: ${{ steps.semantic.outputs.new_release_published }} version: ${{ steps.semantic.outputs.new_release_version }}