From 492f074127f72b330f98d9af193e0ce5cc3ca2b2 Mon Sep 17 00:00:00 2001
From: 0x5457 <0x5457@protonmail.com>
Date: Tue, 3 Dec 2024 05:43:57 +0000
Subject: [PATCH 01/22] chore: kubebuilder init

---
 .devcontainer/devcontainer.json               |  25 ++
 .devcontainer/post-install.sh                 |  23 ++
 .dockerignore                                 |   3 +
 .github/workflows/lint.yml                    |  23 ++
 .github/workflows/test-e2e.yml                |  35 ++
 .github/workflows/test.yml                    |  23 ++
 .gitignore                                    |  27 ++
 .golangci.yml                                 |  47 +++
 Dockerfile                                    |  33 ++
 Makefile                                      | 212 ++++++++++++
 PROJECT                                       |  10 +
 cmd/main.go                                   | 157 +++++++++
 config/default/kustomization.yaml             | 177 ++++++++++
 config/default/manager_metrics_patch.yaml     |   4 +
 config/default/metrics_service.yaml           |  17 +
 config/manager/kustomization.yaml             |   2 +
 config/manager/manager.yaml                   |  95 ++++++
 .../network-policy/allow-metrics-traffic.yaml |  26 ++
 config/network-policy/kustomization.yaml      |   2 +
 config/prometheus/kustomization.yaml          |   2 +
 config/prometheus/monitor.yaml                |  30 ++
 config/rbac/kustomization.yaml                |  20 ++
 config/rbac/leader_election_role.yaml         |  40 +++
 config/rbac/leader_election_role_binding.yaml |  15 +
 config/rbac/metrics_auth_role.yaml            |  17 +
 config/rbac/metrics_auth_role_binding.yaml    |  12 +
 config/rbac/metrics_reader_role.yaml          |   9 +
 config/rbac/role.yaml                         |  11 +
 config/rbac/role_binding.yaml                 |  15 +
 config/rbac/service_account.yaml              |   8 +
 go.mod                                        |  98 ++++++
 go.sum                                        | 251 ++++++++++++++
 hack/boilerplate.go.txt                       |  15 +
 test/e2e/e2e_suite_test.go                    | 120 +++++++
 test/e2e/e2e_test.go                          | 307 ++++++++++++++++++
 test/utils/utils.go                           | 251 ++++++++++++++
 36 files changed, 2162 insertions(+)
 create mode 100644 .devcontainer/devcontainer.json
 create mode 100644 .devcontainer/post-install.sh
 create mode 100644 .dockerignore
 create mode 100644 .github/workflows/lint.yml
 create mode 100644 .github/workflows/test-e2e.yml
 create mode 100644 .github/workflows/test.yml
 create mode 100644 .gitignore
 create mode 100644 .golangci.yml
 create mode 100644 Dockerfile
 create mode 100644 Makefile
 create mode 100644 PROJECT
 create mode 100644 cmd/main.go
 create mode 100644 config/default/kustomization.yaml
 create mode 100644 config/default/manager_metrics_patch.yaml
 create mode 100644 config/default/metrics_service.yaml
 create mode 100644 config/manager/kustomization.yaml
 create mode 100644 config/manager/manager.yaml
 create mode 100644 config/network-policy/allow-metrics-traffic.yaml
 create mode 100644 config/network-policy/kustomization.yaml
 create mode 100644 config/prometheus/kustomization.yaml
 create mode 100644 config/prometheus/monitor.yaml
 create mode 100644 config/rbac/kustomization.yaml
 create mode 100644 config/rbac/leader_election_role.yaml
 create mode 100644 config/rbac/leader_election_role_binding.yaml
 create mode 100644 config/rbac/metrics_auth_role.yaml
 create mode 100644 config/rbac/metrics_auth_role_binding.yaml
 create mode 100644 config/rbac/metrics_reader_role.yaml
 create mode 100644 config/rbac/role.yaml
 create mode 100644 config/rbac/role_binding.yaml
 create mode 100644 config/rbac/service_account.yaml
 create mode 100644 go.mod
 create mode 100644 go.sum
 create mode 100644 hack/boilerplate.go.txt
 create mode 100644 test/e2e/e2e_suite_test.go
 create mode 100644 test/e2e/e2e_test.go
 create mode 100644 test/utils/utils.go

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 0000000..e2cdc09
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,25 @@
+{
+  "name": "Kubebuilder DevContainer",
+  "image": "golang:1.22",
+  "features": {
+    "ghcr.io/devcontainers/features/docker-in-docker:2": {},
+    "ghcr.io/devcontainers/features/git:1": {}
+  },
+
+  "runArgs": ["--network=host"],
+
+  "customizations": {
+    "vscode": {
+      "settings": {
+        "terminal.integrated.shell.linux": "/bin/bash"
+      },
+      "extensions": [
+        "ms-kubernetes-tools.vscode-kubernetes-tools",
+        "ms-azuretools.vscode-docker"
+      ]
+    }
+  },
+
+  "onCreateCommand": "bash .devcontainer/post-install.sh"
+}
+
diff --git a/.devcontainer/post-install.sh b/.devcontainer/post-install.sh
new file mode 100644
index 0000000..265c43e
--- /dev/null
+++ b/.devcontainer/post-install.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+set -x
+
+curl -Lo ./kind https://kind.sigs.k8s.io/dl/latest/kind-linux-amd64
+chmod +x ./kind
+mv ./kind /usr/local/bin/kind
+
+curl -L -o kubebuilder https://go.kubebuilder.io/dl/latest/linux/amd64
+chmod +x kubebuilder
+mv kubebuilder /usr/local/bin/
+
+KUBECTL_VERSION=$(curl -L -s https://dl.k8s.io/release/stable.txt)
+curl -LO "https://dl.k8s.io/release/$KUBECTL_VERSION/bin/linux/amd64/kubectl"
+chmod +x kubectl
+mv kubectl /usr/local/bin/kubectl
+
+docker network create -d=bridge --subnet=172.19.0.0/24 kind
+
+kind version
+kubebuilder version
+docker --version
+go version
+kubectl version --client
diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..a3aab7a
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,3 @@
+# More info: https://docs.docker.com/engine/reference/builder/#dockerignore-file
+# Ignore build and test binaries.
+bin/
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 0000000..f40d365
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,23 @@
+name: Lint
+
+on:
+  push:
+  pull_request:
+
+jobs:
+  lint:
+    name: Run on Ubuntu
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone the code
+        uses: actions/checkout@v4
+
+      - name: Setup Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: '~1.22'
+
+      - name: Run linter
+        uses: golangci/golangci-lint-action@v6
+        with:
+          version: v1.61
diff --git a/.github/workflows/test-e2e.yml b/.github/workflows/test-e2e.yml
new file mode 100644
index 0000000..8780644
--- /dev/null
+++ b/.github/workflows/test-e2e.yml
@@ -0,0 +1,35 @@
+name: E2E Tests
+
+on:
+  push:
+  pull_request:
+
+jobs:
+  test-e2e:
+    name: Run on Ubuntu
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone the code
+        uses: actions/checkout@v4
+
+      - name: Setup Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: '~1.22'
+
+      - name: Install the latest version of kind
+        run: |
+          curl -Lo ./kind https://kind.sigs.k8s.io/dl/latest/kind-linux-amd64
+          chmod +x ./kind
+          sudo mv ./kind /usr/local/bin/kind
+
+      - name: Verify kind installation
+        run: kind version
+
+      - name: Create kind cluster
+        run: kind create cluster
+
+      - name: Running Test e2e
+        run: |
+          go mod tidy
+          make test-e2e
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..7baf657
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,23 @@
+name: Tests
+
+on:
+  push:
+  pull_request:
+
+jobs:
+  test:
+    name: Run on Ubuntu
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone the code
+        uses: actions/checkout@v4
+
+      - name: Setup Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: '~1.22'
+
+      - name: Running Tests
+        run: |
+          go mod tidy
+          make test
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ada68ff
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,27 @@
+# Binaries for programs and plugins
+*.exe
+*.exe~
+*.dll
+*.so
+*.dylib
+bin/*
+Dockerfile.cross
+
+# Test binary, built with `go test -c`
+*.test
+
+# Output of the go coverage tool, specifically when used with LiteIDE
+*.out
+
+# Go workspace file
+go.work
+
+# Kubernetes Generated files - skip generated files, except for vendored files
+!vendor/**/zz_generated.*
+
+# editor and IDE paraphernalia
+.idea
+.vscode
+*.swp
+*.swo
+*~
diff --git a/.golangci.yml b/.golangci.yml
new file mode 100644
index 0000000..6b29746
--- /dev/null
+++ b/.golangci.yml
@@ -0,0 +1,47 @@
+run:
+  timeout: 5m
+  allow-parallel-runners: true
+
+issues:
+  # don't skip warning about doc comments
+  # don't exclude the default set of lint
+  exclude-use-default: false
+  # restore some of the defaults
+  # (fill in the rest as needed)
+  exclude-rules:
+    - path: "api/*"
+      linters:
+        - lll
+    - path: "internal/*"
+      linters:
+        - dupl
+        - lll
+linters:
+  disable-all: true
+  enable:
+    - dupl
+    - errcheck
+    - copyloopvar
+    - ginkgolinter
+    - goconst
+    - gocyclo
+    - gofmt
+    - goimports
+    - gosimple
+    - govet
+    - ineffassign
+    - lll
+    - misspell
+    - nakedret
+    - prealloc
+    - revive
+    - staticcheck
+    - typecheck
+    - unconvert
+    - unparam
+    - unused
+
+linters-settings:
+  revive:
+    rules:
+      - name: comment-spacings
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..4ba18b6
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,33 @@
+# Build the manager binary
+FROM golang:1.22 AS builder
+ARG TARGETOS
+ARG TARGETARCH
+
+WORKDIR /workspace
+# Copy the Go Modules manifests
+COPY go.mod go.mod
+COPY go.sum go.sum
+# cache deps before building and copying source so that we don't need to re-download as much
+# and so that source changes don't invalidate our downloaded layer
+RUN go mod download
+
+# Copy the go source
+COPY cmd/main.go cmd/main.go
+COPY api/ api/
+COPY internal/ internal/
+
+# Build
+# GOARCH has no default value so that the binary is built for the platform of the host where the command
+# was called. For example, if we call make docker-build in a local env running on Apple Silicon (M1),
+# the docker BUILDPLATFORM arg will be linux/arm64, whereas for Apple x86 it will be linux/amd64. Therefore,
+# by leaving it empty we can ensure that the container and the binary shipped in it have the same platform.
+RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o manager cmd/main.go
+
+# Use distroless as minimal base image to package the manager binary
+# Refer to https://github.com/GoogleContainerTools/distroless for more details
+FROM gcr.io/distroless/static:nonroot
+WORKDIR /
+COPY --from=builder /workspace/manager .
+USER 65532:65532
+
+ENTRYPOINT ["/manager"]
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..a95fe02
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,212 @@
+# Image URL to use all building/pushing image targets
+IMG ?= controller:latest
+# ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary.
+ENVTEST_K8S_VERSION = 1.31.0
+
+# Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set)
+ifeq (,$(shell go env GOBIN))
+GOBIN=$(shell go env GOPATH)/bin
+else
+GOBIN=$(shell go env GOBIN)
+endif
+
+# CONTAINER_TOOL defines the container tool to be used for building images.
+# Be aware that the target commands are only tested with Docker which is
+# scaffolded by default. However, you might want to replace it to use other
+# tools. (i.e. podman)
+CONTAINER_TOOL ?= docker
+
+# Setting SHELL to bash allows bash commands to be executed by recipes.
+# Options are set to exit when a recipe line exits non-zero or a piped command fails.
+SHELL = /usr/bin/env bash -o pipefail
+.SHELLFLAGS = -ec
+
+.PHONY: all
+all: build
+
+##@ General
+
+# The help target prints out all targets with their descriptions organized
+# beneath their categories. The categories are represented by '##@' and the
+# target descriptions by '##'. The awk command is responsible for reading the
+# entire set of makefiles included in this invocation, looking for lines of the
+# file as xyz: ## something, and then pretty-format the target and help. Then,
+# if there's a line with ##@ something, that gets pretty-printed as a category.
+# More info on the usage of ANSI control characters for terminal formatting:
+# https://en.wikipedia.org/wiki/ANSI_escape_code#SGR_parameters
+# More info on the awk command:
+# http://linuxcommand.org/lc3_adv_awk.php
+
+.PHONY: help
+help: ## Display this help.
+	@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n  make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf "  \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
+
+##@ Development
+
+.PHONY: manifests
+manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
+	$(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases
+
+.PHONY: generate
+generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
+	$(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..."
+
+.PHONY: fmt
+fmt: ## Run go fmt against code.
+	go fmt ./...
+
+.PHONY: vet
+vet: ## Run go vet against code.
+	go vet ./...
+
+.PHONY: test
+test: manifests generate fmt vet envtest ## Run tests.
+	KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -coverprofile cover.out
+
+# TODO(user): To use a different vendor for e2e tests, modify the setup under 'test/e2e'.
+# The default setup assumes Kind is pre-installed and builds/loads the Manager Docker image locally.
+# Prometheus and CertManager are installed by default; skip with:
+# - PROMETHEUS_INSTALL_SKIP=true
+# - CERT_MANAGER_INSTALL_SKIP=true
+.PHONY: test-e2e
+test-e2e: manifests generate fmt vet ## Run the e2e tests. Expected an isolated environment using Kind.
+	@command -v kind >/dev/null 2>&1 || { \
+		echo "Kind is not installed. Please install Kind manually."; \
+		exit 1; \
+	}
+	@kind get clusters | grep -q 'kind' || { \
+		echo "No Kind cluster is running. Please start a Kind cluster before running the e2e tests."; \
+		exit 1; \
+	}
+	go test ./test/e2e/ -v -ginkgo.v
+
+.PHONY: lint
+lint: golangci-lint ## Run golangci-lint linter
+	$(GOLANGCI_LINT) run
+
+.PHONY: lint-fix
+lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes
+	$(GOLANGCI_LINT) run --fix
+
+##@ Build
+
+.PHONY: build
+build: manifests generate fmt vet ## Build manager binary.
+	go build -o bin/manager cmd/main.go
+
+.PHONY: run
+run: manifests generate fmt vet ## Run a controller from your host.
+	go run ./cmd/main.go
+
+# If you wish to build the manager image targeting other platforms you can use the --platform flag.
+# (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it.
+# More info: https://docs.docker.com/develop/develop-images/build_enhancements/
+.PHONY: docker-build
+docker-build: ## Build docker image with the manager.
+	$(CONTAINER_TOOL) build -t ${IMG} .
+
+.PHONY: docker-push
+docker-push: ## Push docker image with the manager.
+	$(CONTAINER_TOOL) push ${IMG}
+
+# PLATFORMS defines the target platforms for which the manager image is built, to provide support for multiple
+# architectures. (i.e. make docker-buildx IMG=myregistry/myoperator:0.0.1). To use this option you need to:
+# - be able to use docker buildx. More info: https://docs.docker.com/build/buildx/
+# - have enabled BuildKit. More info: https://docs.docker.com/develop/develop-images/build_enhancements/
+# - be able to push the image to your registry (i.e. if you do not set a valid value via IMG=<myregistry/image:<tag>> then the export will fail)
+# To adequately provide solutions that are compatible with multiple platforms, you should consider using this option.
+PLATFORMS ?= linux/arm64,linux/amd64,linux/s390x,linux/ppc64le
+.PHONY: docker-buildx
+docker-buildx: ## Build and push docker image for the manager for cross-platform support
+	# copy existing Dockerfile and insert --platform=${BUILDPLATFORM} into Dockerfile.cross, and preserve the original Dockerfile
+	sed -e '1 s/\(^FROM\)/FROM --platform=\$$\{BUILDPLATFORM\}/; t' -e ' 1,// s//FROM --platform=\$$\{BUILDPLATFORM\}/' Dockerfile > Dockerfile.cross
+	- $(CONTAINER_TOOL) buildx create --name tensor-fusion-operator-builder
+	$(CONTAINER_TOOL) buildx use tensor-fusion-operator-builder
+	- $(CONTAINER_TOOL) buildx build --push --platform=$(PLATFORMS) --tag ${IMG} -f Dockerfile.cross .
+	- $(CONTAINER_TOOL) buildx rm tensor-fusion-operator-builder
+	rm Dockerfile.cross
+
+.PHONY: build-installer
+build-installer: manifests generate kustomize ## Generate a consolidated YAML with CRDs and deployment.
+	mkdir -p dist
+	cd config/manager && $(KUSTOMIZE) edit set image controller=${IMG}
+	$(KUSTOMIZE) build config/default > dist/install.yaml
+
+##@ Deployment
+
+ifndef ignore-not-found
+  ignore-not-found = false
+endif
+
+.PHONY: install
+install: manifests kustomize ## Install CRDs into the K8s cluster specified in ~/.kube/config.
+	$(KUSTOMIZE) build config/crd | $(KUBECTL) apply -f -
+
+.PHONY: uninstall
+uninstall: manifests kustomize ## Uninstall CRDs from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion.
+	$(KUSTOMIZE) build config/crd | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f -
+
+.PHONY: deploy
+deploy: manifests kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config.
+	cd config/manager && $(KUSTOMIZE) edit set image controller=${IMG}
+	$(KUSTOMIZE) build config/default | $(KUBECTL) apply -f -
+
+.PHONY: undeploy
+undeploy: kustomize ## Undeploy controller from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion.
+	$(KUSTOMIZE) build config/default | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f -
+
+##@ Dependencies
+
+## Location to install dependencies to
+LOCALBIN ?= $(shell pwd)/bin
+$(LOCALBIN):
+	mkdir -p $(LOCALBIN)
+
+## Tool Binaries
+KUBECTL ?= kubectl
+KUSTOMIZE ?= $(LOCALBIN)/kustomize
+CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen
+ENVTEST ?= $(LOCALBIN)/setup-envtest
+GOLANGCI_LINT = $(LOCALBIN)/golangci-lint
+
+## Tool Versions
+KUSTOMIZE_VERSION ?= v5.5.0
+CONTROLLER_TOOLS_VERSION ?= v0.16.4
+ENVTEST_VERSION ?= release-0.19
+GOLANGCI_LINT_VERSION ?= v1.61.0
+
+.PHONY: kustomize
+kustomize: $(KUSTOMIZE) ## Download kustomize locally if necessary.
+$(KUSTOMIZE): $(LOCALBIN)
+	$(call go-install-tool,$(KUSTOMIZE),sigs.k8s.io/kustomize/kustomize/v5,$(KUSTOMIZE_VERSION))
+
+.PHONY: controller-gen
+controller-gen: $(CONTROLLER_GEN) ## Download controller-gen locally if necessary.
+$(CONTROLLER_GEN): $(LOCALBIN)
+	$(call go-install-tool,$(CONTROLLER_GEN),sigs.k8s.io/controller-tools/cmd/controller-gen,$(CONTROLLER_TOOLS_VERSION))
+
+.PHONY: envtest
+envtest: $(ENVTEST) ## Download setup-envtest locally if necessary.
+$(ENVTEST): $(LOCALBIN)
+	$(call go-install-tool,$(ENVTEST),sigs.k8s.io/controller-runtime/tools/setup-envtest,$(ENVTEST_VERSION))
+
+.PHONY: golangci-lint
+golangci-lint: $(GOLANGCI_LINT) ## Download golangci-lint locally if necessary.
+$(GOLANGCI_LINT): $(LOCALBIN)
+	$(call go-install-tool,$(GOLANGCI_LINT),github.com/golangci/golangci-lint/cmd/golangci-lint,$(GOLANGCI_LINT_VERSION))
+
+# go-install-tool will 'go install' any package with custom target and name of binary, if it doesn't exist
+# $1 - target path with name of binary
+# $2 - package url which can be installed
+# $3 - specific version of package
+define go-install-tool
+@[ -f "$(1)-$(3)" ] || { \
+set -e; \
+package=$(2)@$(3) ;\
+echo "Downloading $${package}" ;\
+rm -f $(1) || true ;\
+GOBIN=$(LOCALBIN) go install $${package} ;\
+mv $(1) $(1)-$(3) ;\
+} ;\
+ln -sf $(1)-$(3) $(1)
+endef
diff --git a/PROJECT b/PROJECT
new file mode 100644
index 0000000..cb60042
--- /dev/null
+++ b/PROJECT
@@ -0,0 +1,10 @@
+# Code generated by tool. DO NOT EDIT.
+# This file is used to track the info used to scaffold your project
+# and allow the plugins properly work.
+# More info: https://book.kubebuilder.io/reference/project-config.html
+domain: tensor-fusion.ai
+layout:
+- go.kubebuilder.io/v4
+projectName: tensor-fusion-operator
+repo: github.com/NexusGPU/tensor-fusion-operator
+version: "3"
diff --git a/cmd/main.go b/cmd/main.go
new file mode 100644
index 0000000..8992f96
--- /dev/null
+++ b/cmd/main.go
@@ -0,0 +1,157 @@
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package main
+
+import (
+	"crypto/tls"
+	"flag"
+	"os"
+
+	// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
+	// to ensure that exec-entrypoint and run can make use of them.
+	_ "k8s.io/client-go/plugin/pkg/client/auth"
+
+	"k8s.io/apimachinery/pkg/runtime"
+	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
+	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/healthz"
+	"sigs.k8s.io/controller-runtime/pkg/log/zap"
+	"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
+	metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
+	"sigs.k8s.io/controller-runtime/pkg/webhook"
+	// +kubebuilder:scaffold:imports
+)
+
+var (
+	scheme   = runtime.NewScheme()
+	setupLog = ctrl.Log.WithName("setup")
+)
+
+func init() {
+	utilruntime.Must(clientgoscheme.AddToScheme(scheme))
+
+	// +kubebuilder:scaffold:scheme
+}
+
+func main() {
+	var metricsAddr string
+	var enableLeaderElection bool
+	var probeAddr string
+	var secureMetrics bool
+	var enableHTTP2 bool
+	var tlsOpts []func(*tls.Config)
+	flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+
+		"Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.")
+	flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
+	flag.BoolVar(&enableLeaderElection, "leader-elect", false,
+		"Enable leader election for controller manager. "+
+			"Enabling this will ensure there is only one active controller manager.")
+	flag.BoolVar(&secureMetrics, "metrics-secure", true,
+		"If set, the metrics endpoint is served securely via HTTPS. Use --metrics-secure=false to use HTTP instead.")
+	flag.BoolVar(&enableHTTP2, "enable-http2", false,
+		"If set, HTTP/2 will be enabled for the metrics and webhook servers")
+	opts := zap.Options{
+		Development: true,
+	}
+	opts.BindFlags(flag.CommandLine)
+	flag.Parse()
+
+	ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
+
+	// If the enable-http2 flag is false (the default), http/2 is disabled
+	// due to its vulnerabilities. More specifically, disabling http/2
+	// protects against the HTTP/2 Stream Cancellation and
+	// Rapid Reset CVEs. For more information see:
+	// - https://github.com/advisories/GHSA-qppj-fm5r-hxr3
+	// - https://github.com/advisories/GHSA-4374-p667-p6c8
+	disableHTTP2 := func(c *tls.Config) {
+		setupLog.Info("disabling http/2")
+		c.NextProtos = []string{"http/1.1"}
+	}
+
+	if !enableHTTP2 {
+		tlsOpts = append(tlsOpts, disableHTTP2)
+	}
+
+	webhookServer := webhook.NewServer(webhook.Options{
+		TLSOpts: tlsOpts,
+	})
+
+	// Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server.
+	// More info:
+	// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/metrics/server
+	// - https://book.kubebuilder.io/reference/metrics.html
+	metricsServerOptions := metricsserver.Options{
+		BindAddress:   metricsAddr,
+		SecureServing: secureMetrics,
+		TLSOpts:       tlsOpts,
+	}
+
+	if secureMetrics {
+		// FilterProvider is used to protect the metrics endpoint with authn/authz.
+		// These configurations ensure that only authorized users and service accounts
+		// can access the metrics endpoint. The RBAC are configured in 'config/rbac/kustomization.yaml'. More info:
+		// https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/metrics/filters#WithAuthenticationAndAuthorization
+		metricsServerOptions.FilterProvider = filters.WithAuthenticationAndAuthorization
+
+		// TODO(user): If CertDir, CertName, and KeyName are not specified, controller-runtime will automatically
+		// generate self-signed certificates for the metrics server. While convenient for development and testing,
+		// this setup is not recommended for production.
+	}
+
+	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
+		Scheme:                 scheme,
+		Metrics:                metricsServerOptions,
+		WebhookServer:          webhookServer,
+		HealthProbeBindAddress: probeAddr,
+		LeaderElection:         enableLeaderElection,
+		LeaderElectionID:       "85104305.tensor-fusion.ai",
+		// LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily
+		// when the Manager ends. This requires the binary to immediately end when the
+		// Manager is stopped, otherwise, this setting is unsafe. Setting this significantly
+		// speeds up voluntary leader transitions as the new leader doesn't have to wait
+		// LeaseDuration time first.
+		//
+		// In the default scaffold provided, the program ends immediately after
+		// the manager stops, so it would be fine to enable this option. However,
+		// if you perform, or intend to perform, any operation such as cleanup
+		// after the manager stops, then its usage might be unsafe.
+		// LeaderElectionReleaseOnCancel: true,
+	})
+	if err != nil {
+		setupLog.Error(err, "unable to start manager")
+		os.Exit(1)
+	}
+
+	// +kubebuilder:scaffold:builder
+
+	if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
+		setupLog.Error(err, "unable to set up health check")
+		os.Exit(1)
+	}
+	if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
+		setupLog.Error(err, "unable to set up ready check")
+		os.Exit(1)
+	}
+
+	setupLog.Info("starting manager")
+	if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
+		setupLog.Error(err, "problem running manager")
+		os.Exit(1)
+	}
+}
diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml
new file mode 100644
index 0000000..c27f571
--- /dev/null
+++ b/config/default/kustomization.yaml
@@ -0,0 +1,177 @@
+# Adds namespace to all resources.
+namespace: tensor-fusion-operator-system
+
+# Value of this field is prepended to the
+# names of all resources, e.g. a deployment named
+# "wordpress" becomes "alices-wordpress".
+# Note that it should also match with the prefix (text before '-') of the namespace
+# field above.
+namePrefix: tensor-fusion-operator-
+
+# Labels to add to all resources and selectors.
+#labels:
+#- includeSelectors: true
+#  pairs:
+#    someName: someValue
+
+resources:
+#- ../crd
+- ../rbac
+- ../manager
+# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in
+# crd/kustomization.yaml
+#- ../webhook
+# [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required.
+#- ../certmanager
+# [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'.
+#- ../prometheus
+# [METRICS] Expose the controller manager metrics service.
+- metrics_service.yaml
+# [NETWORK POLICY] Protect the /metrics endpoint and Webhook Server with NetworkPolicy.
+# Only Pod(s) running in a namespace labeled with 'metrics: enabled' will be able to gather the metrics.
+# Only CR(s) which require webhooks and are applied in namespaces labeled with 'webhooks: enabled' will
+# be able to communicate with the Webhook Server.
+#- ../network-policy
+
+# Uncomment the patches line if you enable Metrics, and/or are using webhooks and cert-manager
+patches:
+# [METRICS] The following patch will enable the metrics endpoint using HTTPS and the port :8443.
+# More info: https://book.kubebuilder.io/reference/metrics
+- path: manager_metrics_patch.yaml
+  target:
+    kind: Deployment
+
+# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in
+# crd/kustomization.yaml
+#- path: manager_webhook_patch.yaml
+
+# [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER' prefix.
+# Uncomment the following replacements to add the cert-manager CA injection annotations
+#replacements:
+# - source: # Uncomment the following block if you have any webhook
+#     kind: Service
+#     version: v1
+#     name: webhook-service
+#     fieldPath: .metadata.name # Name of the service
+#   targets:
+#     - select:
+#         kind: Certificate
+#         group: cert-manager.io
+#         version: v1
+#       fieldPaths:
+#         - .spec.dnsNames.0
+#         - .spec.dnsNames.1
+#       options:
+#         delimiter: '.'
+#         index: 0
+#         create: true
+# - source:
+#     kind: Service
+#     version: v1
+#     name: webhook-service
+#     fieldPath: .metadata.namespace # Namespace of the service
+#   targets:
+#     - select:
+#         kind: Certificate
+#         group: cert-manager.io
+#         version: v1
+#       fieldPaths:
+#         - .spec.dnsNames.0
+#         - .spec.dnsNames.1
+#       options:
+#         delimiter: '.'
+#         index: 1
+#         create: true
+#
+# - source: # Uncomment the following block if you have a ValidatingWebhook (--programmatic-validation)
+#     kind: Certificate
+#     group: cert-manager.io
+#     version: v1
+#     name: serving-cert # This name should match the one in certificate.yaml
+#     fieldPath: .metadata.namespace # Namespace of the certificate CR
+#   targets:
+#     - select:
+#         kind: ValidatingWebhookConfiguration
+#       fieldPaths:
+#         - .metadata.annotations.[cert-manager.io/inject-ca-from]
+#       options:
+#         delimiter: '/'
+#         index: 0
+#         create: true
+# - source:
+#     kind: Certificate
+#     group: cert-manager.io
+#     version: v1
+#     name: serving-cert # This name should match the one in certificate.yaml
+#     fieldPath: .metadata.name
+#   targets:
+#     - select:
+#         kind: ValidatingWebhookConfiguration
+#       fieldPaths:
+#         - .metadata.annotations.[cert-manager.io/inject-ca-from]
+#       options:
+#         delimiter: '/'
+#         index: 1
+#         create: true
+#
+# - source: # Uncomment the following block if you have a DefaultingWebhook (--defaulting )
+#     kind: Certificate
+#     group: cert-manager.io
+#     version: v1
+#     name: serving-cert # This name should match the one in certificate.yaml
+#     fieldPath: .metadata.namespace # Namespace of the certificate CR
+#   targets:
+#     - select:
+#         kind: MutatingWebhookConfiguration
+#       fieldPaths:
+#         - .metadata.annotations.[cert-manager.io/inject-ca-from]
+#       options:
+#         delimiter: '/'
+#         index: 0
+#         create: true
+# - source:
+#     kind: Certificate
+#     group: cert-manager.io
+#     version: v1
+#     name: serving-cert # This name should match the one in certificate.yaml
+#     fieldPath: .metadata.name
+#   targets:
+#     - select:
+#         kind: MutatingWebhookConfiguration
+#       fieldPaths:
+#         - .metadata.annotations.[cert-manager.io/inject-ca-from]
+#       options:
+#         delimiter: '/'
+#         index: 1
+#         create: true
+#
+# - source: # Uncomment the following block if you have a ConversionWebhook (--conversion)
+#     kind: Certificate
+#     group: cert-manager.io
+#     version: v1
+#     name: serving-cert # This name should match the one in certificate.yaml
+#     fieldPath: .metadata.namespace # Namespace of the certificate CR
+#   targets:
+#     - select:
+#         kind: CustomResourceDefinition
+#       fieldPaths:
+#         - .metadata.annotations.[cert-manager.io/inject-ca-from]
+#       options:
+#         delimiter: '/'
+#         index: 0
+#         create: true
+# - source:
+#     kind: Certificate
+#     group: cert-manager.io
+#     version: v1
+#     name: serving-cert # This name should match the one in certificate.yaml
+#     fieldPath: .metadata.name
+#   targets:
+#     - select:
+#         kind: CustomResourceDefinition
+#       fieldPaths:
+#         - .metadata.annotations.[cert-manager.io/inject-ca-from]
+#       options:
+#         delimiter: '/'
+#         index: 1
+#         create: true
diff --git a/config/default/manager_metrics_patch.yaml b/config/default/manager_metrics_patch.yaml
new file mode 100644
index 0000000..2aaef65
--- /dev/null
+++ b/config/default/manager_metrics_patch.yaml
@@ -0,0 +1,4 @@
+# This patch adds the args to allow exposing the metrics endpoint using HTTPS
+- op: add
+  path: /spec/template/spec/containers/0/args/0
+  value: --metrics-bind-address=:8443
diff --git a/config/default/metrics_service.yaml b/config/default/metrics_service.yaml
new file mode 100644
index 0000000..82a1a1b
--- /dev/null
+++ b/config/default/metrics_service.yaml
@@ -0,0 +1,17 @@
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    control-plane: controller-manager
+    app.kubernetes.io/name: tensor-fusion-operator
+    app.kubernetes.io/managed-by: kustomize
+  name: controller-manager-metrics-service
+  namespace: system
+spec:
+  ports:
+  - name: https
+    port: 8443
+    protocol: TCP
+    targetPort: 8443
+  selector:
+    control-plane: controller-manager
diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml
new file mode 100644
index 0000000..5c5f0b8
--- /dev/null
+++ b/config/manager/kustomization.yaml
@@ -0,0 +1,2 @@
+resources:
+- manager.yaml
diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml
new file mode 100644
index 0000000..1286879
--- /dev/null
+++ b/config/manager/manager.yaml
@@ -0,0 +1,95 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  labels:
+    control-plane: controller-manager
+    app.kubernetes.io/name: tensor-fusion-operator
+    app.kubernetes.io/managed-by: kustomize
+  name: system
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: controller-manager
+  namespace: system
+  labels:
+    control-plane: controller-manager
+    app.kubernetes.io/name: tensor-fusion-operator
+    app.kubernetes.io/managed-by: kustomize
+spec:
+  selector:
+    matchLabels:
+      control-plane: controller-manager
+  replicas: 1
+  template:
+    metadata:
+      annotations:
+        kubectl.kubernetes.io/default-container: manager
+      labels:
+        control-plane: controller-manager
+    spec:
+      # TODO(user): Uncomment the following code to configure the nodeAffinity expression
+      # according to the platforms which are supported by your solution.
+      # It is considered best practice to support multiple architectures. You can
+      # build your manager image using the makefile target docker-buildx.
+      # affinity:
+      #   nodeAffinity:
+      #     requiredDuringSchedulingIgnoredDuringExecution:
+      #       nodeSelectorTerms:
+      #         - matchExpressions:
+      #           - key: kubernetes.io/arch
+      #             operator: In
+      #             values:
+      #               - amd64
+      #               - arm64
+      #               - ppc64le
+      #               - s390x
+      #           - key: kubernetes.io/os
+      #             operator: In
+      #             values:
+      #               - linux
+      securityContext:
+        runAsNonRoot: true
+        # TODO(user): For common cases that do not require escalating privileges
+        # it is recommended to ensure that all your Pods/Containers are restrictive.
+        # More info: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted
+        # Please uncomment the following code if your project does NOT have to work on old Kubernetes
+        # versions < 1.19 or on vendor versions that do NOT support this field by default (e.g. OpenShift < 4.11).
+        # seccompProfile:
+        #   type: RuntimeDefault
+      containers:
+      - command:
+        - /manager
+        args:
+          - --leader-elect
+          - --health-probe-bind-address=:8081
+        image: controller:latest
+        name: manager
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+            - "ALL"
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: 8081
+          initialDelaySeconds: 15
+          periodSeconds: 20
+        readinessProbe:
+          httpGet:
+            path: /readyz
+            port: 8081
+          initialDelaySeconds: 5
+          periodSeconds: 10
+        # TODO(user): Configure the resources accordingly based on the project requirements.
+        # More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
+        resources:
+          limits:
+            cpu: 500m
+            memory: 128Mi
+          requests:
+            cpu: 10m
+            memory: 64Mi
+      serviceAccountName: controller-manager
+      terminationGracePeriodSeconds: 10
diff --git a/config/network-policy/allow-metrics-traffic.yaml b/config/network-policy/allow-metrics-traffic.yaml
new file mode 100644
index 0000000..e6e9d7f
--- /dev/null
+++ b/config/network-policy/allow-metrics-traffic.yaml
@@ -0,0 +1,26 @@
+# This NetworkPolicy allows ingress traffic
+# from Pods running in namespaces labeled with 'metrics: enabled'. Only Pods in those
+# namespaces are able to gather data from the metrics endpoint.
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  labels:
+    app.kubernetes.io/name: tensor-fusion-operator
+    app.kubernetes.io/managed-by: kustomize
+  name: allow-metrics-traffic
+  namespace: system
+spec:
+  podSelector:
+    matchLabels:
+      control-plane: controller-manager
+  policyTypes:
+    - Ingress
+  ingress:
+    # This allows ingress traffic from any namespace with the label metrics: enabled
+    - from:
+      - namespaceSelector:
+          matchLabels:
+            metrics: enabled  # Only from namespaces with this label
+      ports:
+        - port: 8443
+          protocol: TCP
diff --git a/config/network-policy/kustomization.yaml b/config/network-policy/kustomization.yaml
new file mode 100644
index 0000000..ec0fb5e
--- /dev/null
+++ b/config/network-policy/kustomization.yaml
@@ -0,0 +1,2 @@
+resources:
+- allow-metrics-traffic.yaml
diff --git a/config/prometheus/kustomization.yaml b/config/prometheus/kustomization.yaml
new file mode 100644
index 0000000..ed13716
--- /dev/null
+++ b/config/prometheus/kustomization.yaml
@@ -0,0 +1,2 @@
+resources:
+- monitor.yaml
diff --git a/config/prometheus/monitor.yaml b/config/prometheus/monitor.yaml
new file mode 100644
index 0000000..f732325
--- /dev/null
+++ b/config/prometheus/monitor.yaml
@@ -0,0 +1,30 @@
+# Prometheus Monitor Service (Metrics)
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  labels:
+    control-plane: controller-manager
+    app.kubernetes.io/name: tensor-fusion-operator
+    app.kubernetes.io/managed-by: kustomize
+  name: controller-manager-metrics-monitor
+  namespace: system
+spec:
+  endpoints:
+    - path: /metrics
+      port: https # Ensure this is the name of the port that exposes HTTPS metrics
+      scheme: https
+      bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+      tlsConfig:
+        # TODO(user): The option insecureSkipVerify: true is not recommended for production since it disables
+        # certificate verification. This poses a significant security risk by making the system vulnerable to
+        # man-in-the-middle attacks, where an attacker could intercept and manipulate the communication between
+        # Prometheus and the monitored services. This could lead to unauthorized access to sensitive metrics data,
+        # compromising the integrity and confidentiality of the information.
+        # Please use the following options for secure configurations:
+        # caFile: /etc/metrics-certs/ca.crt
+        # certFile: /etc/metrics-certs/tls.crt
+        # keyFile: /etc/metrics-certs/tls.key
+        insecureSkipVerify: true
+  selector:
+    matchLabels:
+      control-plane: controller-manager
diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml
new file mode 100644
index 0000000..5619aa0
--- /dev/null
+++ b/config/rbac/kustomization.yaml
@@ -0,0 +1,20 @@
+resources:
+# All RBAC will be applied under this service account in
+# the deployment namespace. You may comment out this resource
+# if your manager will use a service account that exists at
+# runtime. Be sure to update RoleBinding and ClusterRoleBinding
+# subjects if changing service account names.
+- service_account.yaml
+- role.yaml
+- role_binding.yaml
+- leader_election_role.yaml
+- leader_election_role_binding.yaml
+# The following RBAC configurations are used to protect
+# the metrics endpoint with authn/authz. These configurations
+# ensure that only authorized users and service accounts
+# can access the metrics endpoint. Comment out the following
+# permissions if you want to disable this protection.
+# More info: https://book.kubebuilder.io/reference/metrics.html
+- metrics_auth_role.yaml
+- metrics_auth_role_binding.yaml
+- metrics_reader_role.yaml
diff --git a/config/rbac/leader_election_role.yaml b/config/rbac/leader_election_role.yaml
new file mode 100644
index 0000000..d77f50e
--- /dev/null
+++ b/config/rbac/leader_election_role.yaml
@@ -0,0 +1,40 @@
+# permissions to do leader election.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  labels:
+    app.kubernetes.io/name: tensor-fusion-operator
+    app.kubernetes.io/managed-by: kustomize
+  name: leader-election-role
+rules:
+- apiGroups:
+  - ""
+  resources:
+  - configmaps
+  verbs:
+  - get
+  - list
+  - watch
+  - create
+  - update
+  - patch
+  - delete
+- apiGroups:
+  - coordination.k8s.io
+  resources:
+  - leases
+  verbs:
+  - get
+  - list
+  - watch
+  - create
+  - update
+  - patch
+  - delete
+- apiGroups:
+  - ""
+  resources:
+  - events
+  verbs:
+  - create
+  - patch
diff --git a/config/rbac/leader_election_role_binding.yaml b/config/rbac/leader_election_role_binding.yaml
new file mode 100644
index 0000000..9e4dd73
--- /dev/null
+++ b/config/rbac/leader_election_role_binding.yaml
@@ -0,0 +1,15 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  labels:
+    app.kubernetes.io/name: tensor-fusion-operator
+    app.kubernetes.io/managed-by: kustomize
+  name: leader-election-rolebinding
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: leader-election-role
+subjects:
+- kind: ServiceAccount
+  name: controller-manager
+  namespace: system
diff --git a/config/rbac/metrics_auth_role.yaml b/config/rbac/metrics_auth_role.yaml
new file mode 100644
index 0000000..32d2e4e
--- /dev/null
+++ b/config/rbac/metrics_auth_role.yaml
@@ -0,0 +1,17 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: metrics-auth-role
+rules:
+- apiGroups:
+  - authentication.k8s.io
+  resources:
+  - tokenreviews
+  verbs:
+  - create
+- apiGroups:
+  - authorization.k8s.io
+  resources:
+  - subjectaccessreviews
+  verbs:
+  - create
diff --git a/config/rbac/metrics_auth_role_binding.yaml b/config/rbac/metrics_auth_role_binding.yaml
new file mode 100644
index 0000000..e775d67
--- /dev/null
+++ b/config/rbac/metrics_auth_role_binding.yaml
@@ -0,0 +1,12 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: metrics-auth-rolebinding
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: metrics-auth-role
+subjects:
+- kind: ServiceAccount
+  name: controller-manager
+  namespace: system
diff --git a/config/rbac/metrics_reader_role.yaml b/config/rbac/metrics_reader_role.yaml
new file mode 100644
index 0000000..51a75db
--- /dev/null
+++ b/config/rbac/metrics_reader_role.yaml
@@ -0,0 +1,9 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: metrics-reader
+rules:
+- nonResourceURLs:
+  - "/metrics"
+  verbs:
+  - get
diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml
new file mode 100644
index 0000000..7454ff6
--- /dev/null
+++ b/config/rbac/role.yaml
@@ -0,0 +1,11 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  labels:
+    app.kubernetes.io/name: tensor-fusion-operator
+    app.kubernetes.io/managed-by: kustomize
+  name: manager-role
+rules:
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["get", "list", "watch"]
diff --git a/config/rbac/role_binding.yaml b/config/rbac/role_binding.yaml
new file mode 100644
index 0000000..6087d7e
--- /dev/null
+++ b/config/rbac/role_binding.yaml
@@ -0,0 +1,15 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  labels:
+    app.kubernetes.io/name: tensor-fusion-operator
+    app.kubernetes.io/managed-by: kustomize
+  name: manager-rolebinding
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: manager-role
+subjects:
+- kind: ServiceAccount
+  name: controller-manager
+  namespace: system
diff --git a/config/rbac/service_account.yaml b/config/rbac/service_account.yaml
new file mode 100644
index 0000000..20beb76
--- /dev/null
+++ b/config/rbac/service_account.yaml
@@ -0,0 +1,8 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  labels:
+    app.kubernetes.io/name: tensor-fusion-operator
+    app.kubernetes.io/managed-by: kustomize
+  name: controller-manager
+  namespace: system
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..c5f3936
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,98 @@
+module github.com/NexusGPU/tensor-fusion-operator
+
+go 1.22.0
+
+require (
+	github.com/onsi/ginkgo/v2 v2.19.0
+	github.com/onsi/gomega v1.33.1
+	k8s.io/apimachinery v0.31.0
+	k8s.io/client-go v0.31.0
+	sigs.k8s.io/controller-runtime v0.19.1
+)
+
+require (
+	github.com/antlr4-go/antlr/v4 v4.13.0 // indirect
+	github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a // indirect
+	github.com/beorn7/perks v1.0.1 // indirect
+	github.com/blang/semver/v4 v4.0.0 // indirect
+	github.com/cenkalti/backoff/v4 v4.3.0 // indirect
+	github.com/cespare/xxhash/v2 v2.3.0 // indirect
+	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
+	github.com/emicklei/go-restful/v3 v3.11.0 // indirect
+	github.com/evanphx/json-patch/v5 v5.9.0 // indirect
+	github.com/felixge/httpsnoop v1.0.4 // indirect
+	github.com/fsnotify/fsnotify v1.7.0 // indirect
+	github.com/fxamacker/cbor/v2 v2.7.0 // indirect
+	github.com/go-logr/logr v1.4.2 // indirect
+	github.com/go-logr/stdr v1.2.2 // indirect
+	github.com/go-logr/zapr v1.3.0 // indirect
+	github.com/go-openapi/jsonpointer v0.19.6 // indirect
+	github.com/go-openapi/jsonreference v0.20.2 // indirect
+	github.com/go-openapi/swag v0.22.4 // indirect
+	github.com/go-task/slim-sprig/v3 v3.0.0 // indirect
+	github.com/gogo/protobuf v1.3.2 // indirect
+	github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
+	github.com/golang/protobuf v1.5.4 // indirect
+	github.com/google/cel-go v0.20.1 // indirect
+	github.com/google/gnostic-models v0.6.8 // indirect
+	github.com/google/go-cmp v0.6.0 // indirect
+	github.com/google/gofuzz v1.2.0 // indirect
+	github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af // indirect
+	github.com/google/uuid v1.6.0 // indirect
+	github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect
+	github.com/imdario/mergo v0.3.6 // indirect
+	github.com/inconshreveable/mousetrap v1.1.0 // indirect
+	github.com/josharian/intern v1.0.0 // indirect
+	github.com/json-iterator/go v1.1.12 // indirect
+	github.com/mailru/easyjson v0.7.7 // indirect
+	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
+	github.com/modern-go/reflect2 v1.0.2 // indirect
+	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
+	github.com/pkg/errors v0.9.1 // indirect
+	github.com/prometheus/client_golang v1.19.1 // indirect
+	github.com/prometheus/client_model v0.6.1 // indirect
+	github.com/prometheus/common v0.55.0 // indirect
+	github.com/prometheus/procfs v0.15.1 // indirect
+	github.com/spf13/cobra v1.8.1 // indirect
+	github.com/spf13/pflag v1.0.5 // indirect
+	github.com/stoewer/go-strcase v1.2.0 // indirect
+	github.com/x448/float16 v0.8.4 // indirect
+	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 // indirect
+	go.opentelemetry.io/otel v1.28.0 // indirect
+	go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 // indirect
+	go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0 // indirect
+	go.opentelemetry.io/otel/metric v1.28.0 // indirect
+	go.opentelemetry.io/otel/sdk v1.28.0 // indirect
+	go.opentelemetry.io/otel/trace v1.28.0 // indirect
+	go.opentelemetry.io/proto/otlp v1.3.1 // indirect
+	go.uber.org/multierr v1.11.0 // indirect
+	go.uber.org/zap v1.26.0 // indirect
+	golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc // indirect
+	golang.org/x/net v0.26.0 // indirect
+	golang.org/x/oauth2 v0.21.0 // indirect
+	golang.org/x/sync v0.7.0 // indirect
+	golang.org/x/sys v0.21.0 // indirect
+	golang.org/x/term v0.21.0 // indirect
+	golang.org/x/text v0.16.0 // indirect
+	golang.org/x/time v0.3.0 // indirect
+	golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect
+	gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
+	google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157 // indirect
+	google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094 // indirect
+	google.golang.org/grpc v1.65.0 // indirect
+	google.golang.org/protobuf v1.34.2 // indirect
+	gopkg.in/inf.v0 v0.9.1 // indirect
+	gopkg.in/yaml.v2 v2.4.0 // indirect
+	gopkg.in/yaml.v3 v3.0.1 // indirect
+	k8s.io/api v0.31.0 // indirect
+	k8s.io/apiextensions-apiserver v0.31.0 // indirect
+	k8s.io/apiserver v0.31.0 // indirect
+	k8s.io/component-base v0.31.0 // indirect
+	k8s.io/klog/v2 v2.130.1 // indirect
+	k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect
+	k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 // indirect
+	sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.30.3 // indirect
+	sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
+	sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
+	sigs.k8s.io/yaml v1.4.0 // indirect
+)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..0958667
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,251 @@
+github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI=
+github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g=
+github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a h1:idn718Q4B6AGu/h5Sxe66HYVdqdGu2l9Iebqhi/AEoA=
+github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY=
+github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
+github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
+github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
+github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ=
+github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8=
+github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
+github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
+github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
+github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
+github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
+github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g=
+github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
+github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k=
+github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ=
+github.com/evanphx/json-patch/v5 v5.9.0 h1:kcBlZQbplgElYIlo/n1hJbls2z/1awpXxpRi0/FOJfg=
+github.com/evanphx/json-patch/v5 v5.9.0/go.mod h1:VNkHZ/282BpEyt/tObQO8s5CMPmYYq14uClGH4abBuQ=
+github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
+github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
+github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA=
+github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM=
+github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E=
+github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ=
+github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
+github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
+github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
+github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
+github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
+github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ=
+github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg=
+github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE=
+github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs=
+github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE=
+github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k=
+github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14=
+github.com/go-openapi/swag v0.22.4 h1:QLMzNJnMGPRNDCbySlcj1x01tzU8/9LTTL9hZZZogBU=
+github.com/go-openapi/swag v0.22.4/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14=
+github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI=
+github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8=
+github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
+github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
+github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
+github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
+github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
+github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
+github.com/google/cel-go v0.20.1 h1:nDx9r8S3L4pE61eDdt8igGj8rf5kjYR3ILxWIpWNi84=
+github.com/google/cel-go v0.20.1/go.mod h1:kWcIzTsPX0zmQ+H3TirHstLLf9ep5QTsZBN9u4dOYLg=
+github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I=
+github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U=
+github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
+github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
+github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
+github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
+github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0=
+github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
+github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af h1:kmjWCqn2qkEml422C2Rrd27c3VGxi6a/6HNq8QmHRKM=
+github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af/go.mod h1:K1liHPHnj73Fdn/EKuT8nrFqBihUSKXoLYU0BuatOYo=
+github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
+github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 h1:bkypFPDjIYGfCYD5mRBvpqxfYX1YCS1PXdKYWi8FsN0=
+github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0/go.mod h1:P+Lt/0by1T8bfcF3z737NnSbmxQAppXMRziHUxPOC8k=
+github.com/imdario/mergo v0.3.6 h1:xTNEAn+kxVO7dTZGu0CegyqKZmoWFI0rF8UxjlB2d28=
+github.com/imdario/mergo v0.3.6/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA=
+github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
+github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
+github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
+github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
+github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
+github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
+github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
+github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
+github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
+github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
+github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
+github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
+github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
+github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
+github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
+github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
+github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
+github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
+github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
+github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
+github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
+github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
+github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
+github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
+github.com/onsi/ginkgo/v2 v2.19.0 h1:9Cnnf7UHo57Hy3k6/m5k3dRfGTMXGvxhHFvkDTCTpvA=
+github.com/onsi/ginkgo/v2 v2.19.0/go.mod h1:rlwLi9PilAFJ8jCg9UE1QP6VBpd6/xj3SRC0d6TU0To=
+github.com/onsi/gomega v1.33.1 h1:dsYjIxxSR755MDmKVsaFQTE22ChNBcuuTWgkUDSubOk=
+github.com/onsi/gomega v1.33.1/go.mod h1:U4R44UsT+9eLIaYRB2a5qajjtQYn0hauxvRm16AVYg0=
+github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
+github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
+github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE=
+github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho=
+github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
+github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
+github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc=
+github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8=
+github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
+github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
+github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
+github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
+github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
+github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM=
+github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y=
+github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
+github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
+github.com/stoewer/go-strcase v1.2.0 h1:Z2iHWqGXH00XYgqDmNgQbIBxf3wrNq0F3feEy0ainaU=
+github.com/stoewer/go-strcase v1.2.0/go.mod h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag1KpM8ahLw8=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
+github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
+github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
+github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
+github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
+github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
+github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
+github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
+github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
+github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
+github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
+go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 h1:4K4tsIXefpVJtvA/8srF4V4y0akAoPHkIslgAkjixJA=
+go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0/go.mod h1:jjdQuTGVsXV4vSs+CJ2qYDeDPf9yIJV23qlIzBm73Vg=
+go.opentelemetry.io/otel v1.28.0 h1:/SqNcYk+idO0CxKEUOtKQClMK/MimZihKYMruSMViUo=
+go.opentelemetry.io/otel v1.28.0/go.mod h1:q68ijF8Fc8CnMHKyzqL6akLO46ePnjkgfIMIjUIX9z4=
+go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 h1:3Q/xZUyC1BBkualc9ROb4G8qkH90LXEIICcs5zv1OYY=
+go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0/go.mod h1:s75jGIWA9OfCMzF0xr+ZgfrB5FEbbV7UuYo32ahUiFI=
+go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0 h1:qFffATk0X+HD+f1Z8lswGiOQYKHRlzfmdJm0wEaVrFA=
+go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0/go.mod h1:MOiCmryaYtc+V0Ei+Tx9o5S1ZjA7kzLucuVuyzBZloQ=
+go.opentelemetry.io/otel/metric v1.28.0 h1:f0HGvSl1KRAU1DLgLGFjrwVyismPlnuU6JD6bOeuA5Q=
+go.opentelemetry.io/otel/metric v1.28.0/go.mod h1:Fb1eVBFZmLVTMb6PPohq3TO9IIhUisDsbJoL/+uQW4s=
+go.opentelemetry.io/otel/sdk v1.28.0 h1:b9d7hIry8yZsgtbmM0DKyPWMMUMlK9NEKuIG4aBqWyE=
+go.opentelemetry.io/otel/sdk v1.28.0/go.mod h1:oYj7ClPUA7Iw3m+r7GeEjz0qckQRJK2B8zjcZEfu7Pg=
+go.opentelemetry.io/otel/trace v1.28.0 h1:GhQ9cUuQGmNDd5BTCP2dAvv75RdMxEfTmYejp+lkx9g=
+go.opentelemetry.io/otel/trace v1.28.0/go.mod h1:jPyXzNPg6da9+38HEwElrQiHlVMTnVfM3/yv2OlIHaI=
+go.opentelemetry.io/proto/otlp v1.3.1 h1:TrMUixzpM0yuc/znrFTP9MMRh8trP93mkCiDVeXrui0=
+go.opentelemetry.io/proto/otlp v1.3.1/go.mod h1:0X1WI4de4ZsLrrJNLAQbFeLCm3T7yBkR0XqQ7niQU+8=
+go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
+go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
+go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
+go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
+go.uber.org/zap v1.26.0 h1:sI7k6L95XOKS281NhVKOFCUNIvv9e0w4BF8N3u+tCRo=
+go.uber.org/zap v1.26.0/go.mod h1:dtElttAiwGvoJ/vj4IwHBS/gXsEu/pZ50mUIRWuG0so=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
+golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
+golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc h1:mCRnTeVUjcrhlRmO0VK8a6k6Rrf6TF9htwo2pJVSjIU=
+golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc/go.mod h1:V1LtkGg67GoY2N1AnLN78QLrzxkLyJw7RJb1gzOOz9w=
+golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
+golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
+golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
+golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ=
+golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE=
+golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs=
+golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M=
+golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws=
+golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA=
+golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4=
+golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI=
+golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4=
+golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
+golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
+golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg=
+golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw=
+gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY=
+google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157 h1:7whR9kGa5LUwFtpLm2ArCEejtnxlGeLbAyjFY8sGNFw=
+google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157/go.mod h1:99sLkeliLXfdj2J75X3Ho+rrVCaJze0uwN7zDDkjPVU=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094 h1:BwIjyKYGsK9dMCBOorzRri8MQwmi7mT9rGHsCEinZkA=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY=
+google.golang.org/grpc v1.65.0 h1:bs/cUb4lp1G5iImFFd3u5ixQzweKizoZJAwBNLR42lc=
+google.golang.org/grpc v1.65.0/go.mod h1:WgYC2ypjlB0EiQi6wdKixMqukr6lBc0Vo+oOgjrM5ZQ=
+google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
+google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
+gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
+gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4=
+gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M=
+gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc=
+gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
+gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
+gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
+gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
+gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+k8s.io/api v0.31.0 h1:b9LiSjR2ym/SzTOlfMHm1tr7/21aD7fSkqgD/CVJBCo=
+k8s.io/api v0.31.0/go.mod h1:0YiFF+JfFxMM6+1hQei8FY8M7s1Mth+z/q7eF1aJkTE=
+k8s.io/apiextensions-apiserver v0.31.0 h1:fZgCVhGwsclj3qCw1buVXCV6khjRzKC5eCFt24kyLSk=
+k8s.io/apiextensions-apiserver v0.31.0/go.mod h1:b9aMDEYaEe5sdK+1T0KU78ApR/5ZVp4i56VacZYEHxk=
+k8s.io/apimachinery v0.31.0 h1:m9jOiSr3FoSSL5WO9bjm1n6B9KROYYgNZOb4tyZ1lBc=
+k8s.io/apimachinery v0.31.0/go.mod h1:rsPdaZJfTfLsNJSQzNHQvYoTmxhoOEofxtOsF3rtsMo=
+k8s.io/apiserver v0.31.0 h1:p+2dgJjy+bk+B1Csz+mc2wl5gHwvNkC9QJV+w55LVrY=
+k8s.io/apiserver v0.31.0/go.mod h1:KI9ox5Yu902iBnnyMmy7ajonhKnkeZYJhTZ/YI+WEMk=
+k8s.io/client-go v0.31.0 h1:QqEJzNjbN2Yv1H79SsS+SWnXkBgVu4Pj3CJQgbx0gI8=
+k8s.io/client-go v0.31.0/go.mod h1:Y9wvC76g4fLjmU0BA+rV+h2cncoadjvjjkkIGoTLcGU=
+k8s.io/component-base v0.31.0 h1:/KIzGM5EvPNQcYgwq5NwoQBaOlVFrghoVGr8lG6vNRs=
+k8s.io/component-base v0.31.0/go.mod h1:TYVuzI1QmN4L5ItVdMSXKvH7/DtvIuas5/mm8YT3rTo=
+k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
+k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
+k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 h1:BZqlfIlq5YbRMFko6/PM7FjZpUb45WallggurYhKGag=
+k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340/go.mod h1:yD4MZYeKMBwQKVht279WycxKyM84kkAx2DPrTXaeb98=
+k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 h1:pUdcCO1Lk/tbT5ztQWOBi5HBgbBP1J8+AsQnQCKsi8A=
+k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
+sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.30.3 h1:2770sDpzrjjsAtVhSeUFseziht227YAWYHLGNM8QPwY=
+sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.30.3/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw=
+sigs.k8s.io/controller-runtime v0.19.1 h1:Son+Q40+Be3QWb+niBXAg2vFiYWolDjjRfO8hn/cxOk=
+sigs.k8s.io/controller-runtime v0.19.1/go.mod h1:iRmWllt8IlaLjvTTDLhRBXIEtkCK6hwVBJJsYS9Ajf4=
+sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo=
+sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0=
+sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4=
+sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08=
+sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E=
+sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY=
diff --git a/hack/boilerplate.go.txt b/hack/boilerplate.go.txt
new file mode 100644
index 0000000..ff72ff2
--- /dev/null
+++ b/hack/boilerplate.go.txt
@@ -0,0 +1,15 @@
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
\ No newline at end of file
diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go
new file mode 100644
index 0000000..0b72e40
--- /dev/null
+++ b/test/e2e/e2e_suite_test.go
@@ -0,0 +1,120 @@
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package e2e
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/NexusGPU/tensor-fusion-operator/test/utils"
+)
+
+var (
+	// Optional Environment Variables:
+	// - PROMETHEUS_INSTALL_SKIP=true: Skips Prometheus Operator installation during test setup.
+	// - CERT_MANAGER_INSTALL_SKIP=true: Skips CertManager installation during test setup.
+	// These variables are useful if Prometheus or CertManager is already installed, avoiding
+	// re-installation and conflicts.
+	skipPrometheusInstall  = os.Getenv("PROMETHEUS_INSTALL_SKIP") == "true"
+	skipCertManagerInstall = os.Getenv("CERT_MANAGER_INSTALL_SKIP") == "true"
+	// isPrometheusOperatorAlreadyInstalled is set by BeforeSuite from utils.IsPrometheusCRDsInstalled()
+	isPrometheusOperatorAlreadyInstalled = false
+	// isCertManagerAlreadyInstalled is set by BeforeSuite from utils.IsCertManagerCRDsInstalled()
+	isCertManagerAlreadyInstalled = false
+
+	// projectImage is the name of the image that BeforeSuite builds and loads into Kind
+	// so that the source changes under test are what gets deployed.
+	projectImage = "example.com/tensor-fusion-operator:v0.0.1"
+)
+
+// TestE2E runs the end-to-end (e2e) test suite for the project. These tests execute in an isolated,
+// temporary environment to validate project changes and are intended to be used in CI jobs.
+// The default setup requires Kind, builds/loads the Manager Docker image locally, and installs
+// CertManager and Prometheus.
+func TestE2E(t *testing.T) {
+	RegisterFailHandler(Fail) // report Gomega assertion failures through Ginkgo's Fail
+	_, _ = fmt.Fprintf(GinkgoWriter, "Starting tensor-fusion-operator integration test suite\n")
+	RunSpecs(t, "e2e suite")
+}
+
+// BeforeSuite prepares everything the specs depend on: generated code and manifests, the manager
+// image loaded into Kind, and (unless skipped or already present) Prometheus Operator and CertManager.
+var _ = BeforeSuite(func() {
+	By("Ensure that Prometheus is enabled")
+	// The result of UncommentCode is discarded.
+	_ = utils.UncommentCode("config/default/kustomization.yaml", "#- ../prometheus", "#")
+
+	By("generating files")
+	_, genErr := utils.Run(exec.Command("make", "generate"))
+	ExpectWithOffset(1, genErr).NotTo(HaveOccurred(), "Failed to run make generate")
+
+	By("generating manifests")
+	_, manifestsErr := utils.Run(exec.Command("make", "manifests"))
+	ExpectWithOffset(1, manifestsErr).NotTo(HaveOccurred(), "Failed to run make manifests")
+
+	By("building the manager(Operator) image")
+	imageArg := fmt.Sprintf("IMG=%s", projectImage)
+	_, buildErr := utils.Run(exec.Command("make", "docker-build", imageArg))
+	ExpectWithOffset(1, buildErr).NotTo(HaveOccurred(), "Failed to build the manager(Operator) image")
+
+	// TODO(user): If you want to change the e2e test vendor from Kind, ensure the image is
+	// built and available before running the tests. Also, remove the following block.
+	By("loading the manager(Operator) image on Kind")
+	loadErr := utils.LoadImageToKindClusterWithName(projectImage)
+	ExpectWithOffset(1, loadErr).NotTo(HaveOccurred(), "Failed to load the manager(Operator) image into Kind")
+
+	// These e2e tests are meant for a throwaway cluster that is created and destroyed per run.
+	// Because they may also be run where Prometheus or CertManager already exists, each one is
+	// installed only when its install is not skipped and its CRDs are not already present.
+	if !skipPrometheusInstall {
+		By("checking if prometheus is installed already")
+		isPrometheusOperatorAlreadyInstalled = utils.IsPrometheusCRDsInstalled()
+		if isPrometheusOperatorAlreadyInstalled {
+			_, _ = fmt.Fprintf(GinkgoWriter, "WARNING: Prometheus Operator is already installed. Skipping installation...\n")
+		} else {
+			_, _ = fmt.Fprintf(GinkgoWriter, "Installing Prometheus Operator...\n")
+			Expect(utils.InstallPrometheusOperator()).To(Succeed(), "Failed to install Prometheus Operator")
+		}
+	}
+	if !skipCertManagerInstall {
+		By("checking if cert manager is installed already")
+		isCertManagerAlreadyInstalled = utils.IsCertManagerCRDsInstalled()
+		if isCertManagerAlreadyInstalled {
+			_, _ = fmt.Fprintf(GinkgoWriter, "WARNING: CertManager is already installed. Skipping installation...\n")
+		} else {
+			_, _ = fmt.Fprintf(GinkgoWriter, "Installing CertManager...\n")
+			Expect(utils.InstallCertManager()).To(Succeed(), "Failed to install CertManager")
+		}
+	}
+})
+
+// AfterSuite uninstalls Prometheus Operator and CertManager unless they were skipped or already present.
+var _ = AfterSuite(func() {
+	if !(skipPrometheusInstall || isPrometheusOperatorAlreadyInstalled) {
+		_, _ = fmt.Fprintf(GinkgoWriter, "Uninstalling Prometheus Operator...\n")
+		utils.UninstallPrometheusOperator()
+	}
+	if !(skipCertManagerInstall || isCertManagerAlreadyInstalled) {
+		_, _ = fmt.Fprintf(GinkgoWriter, "Uninstalling CertManager...\n")
+		utils.UninstallCertManager()
+	}
+})
diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go
new file mode 100644
index 0000000..a218480
--- /dev/null
+++ b/test/e2e/e2e_test.go
@@ -0,0 +1,307 @@
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package e2e
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/NexusGPU/tensor-fusion-operator/test/utils"
+)
+
+// Names of the cluster resources that the e2e specs create or look up.
+const (
+	// namespace is where the project is deployed.
+	namespace = "tensor-fusion-operator-system"
+	// serviceAccountName is the service account created for the project.
+	serviceAccountName = "tensor-fusion-operator-controller-manager"
+	// metricsServiceName is the name of the project's metrics service.
+	metricsServiceName = "tensor-fusion-operator-controller-manager-metrics-service"
+	// metricsRoleBindingName is the name of the RBAC binding created to allow reading the metrics data.
+	metricsRoleBindingName = "tensor-fusion-operator-metrics-binding"
+)
+
+var _ = Describe("Manager", Ordered, func() {
+	var controllerPodName string
+
+	// One-time setup for this Ordered container: create the manager namespace, install the
+	// CRDs, and deploy the controller-manager using projectImage. Every step is asserted, so
+	// a failing command fails the setup.
+	BeforeAll(func() {
+		By("creating manager namespace")
+		_, nsErr := utils.Run(exec.Command("kubectl", "create", "ns", namespace))
+		Expect(nsErr).NotTo(HaveOccurred(), "Failed to create namespace")
+
+		By("installing CRDs")
+		_, crdErr := utils.Run(exec.Command("make", "install"))
+		Expect(crdErr).NotTo(HaveOccurred(), "Failed to install CRDs")
+
+		By("deploying the controller-manager")
+		// Passed to make as the IMG variable assignment.
+		imageArg := fmt.Sprintf("IMG=%s", projectImage)
+		_, deployErr := utils.Run(exec.Command("make", "deploy", imageArg))
+		Expect(deployErr).NotTo(HaveOccurred(), "Failed to deploy the controller-manager")
+	})
+
+	// Teardown once all specs have run: delete the curl pod, undeploy the controller-manager,
+	// uninstall the CRDs and remove the namespace, in that order.
+	AfterAll(func() {
+		steps := []struct {
+			name string
+			argv []string
+		}{
+			{"cleaning up the curl pod for metrics",
+				[]string{"kubectl", "delete", "pod", "curl-metrics", "-n", namespace}},
+			{"undeploying the controller-manager", []string{"make", "undeploy"}},
+			{"uninstalling CRDs", []string{"make", "uninstall"}},
+			{"removing manager namespace", []string{"kubectl", "delete", "ns", namespace}},
+		}
+		for _, s := range steps {
+			// The output and error of each cleanup command are discarded.
+			By(s.name)
+			_, _ = utils.Run(exec.Command(s.argv[0], s.argv[1:]...))
+		}
+	})
+
+	// After each failed spec, dump controller logs, namespace events, curl-metrics logs and the
+	// controller pod description. Collected text is passed as an argument, never as the format string.
+	AfterEach(func() {
+		specReport := CurrentSpecReport()
+		if specReport.Failed() {
+			By("Fetching controller manager pod logs")
+			cmd := exec.Command("kubectl", "logs", controllerPodName, "-n", namespace)
+			controllerLogs, err := utils.Run(cmd)
+			if err == nil {
+				_, _ = fmt.Fprintf(GinkgoWriter, "Controller logs:\n %s", controllerLogs)
+			} else {
+				_, _ = fmt.Fprintf(GinkgoWriter, "Failed to get Controller logs: %s", err)
+			}
+
+			By("Fetching Kubernetes events")
+			cmd = exec.Command("kubectl", "get", "events", "-n", namespace, "--sort-by=.lastTimestamp")
+			eventsOutput, err := utils.Run(cmd)
+			if err == nil {
+				_, _ = fmt.Fprintf(GinkgoWriter, "Kubernetes events:\n%s", eventsOutput)
+			} else {
+				_, _ = fmt.Fprintf(GinkgoWriter, "Failed to get Kubernetes events: %s", err)
+			}
+
+			By("Fetching curl-metrics logs")
+			cmd = exec.Command("kubectl", "logs", "curl-metrics", "-n", namespace)
+			metricsOutput, err := utils.Run(cmd)
+			if err == nil {
+				_, _ = fmt.Fprintf(GinkgoWriter, "Metrics logs:\n %s", metricsOutput)
+			} else {
+				_, _ = fmt.Fprintf(GinkgoWriter, "Failed to get curl-metrics logs: %s", err)
+			}
+
+			By("Fetching controller manager pod description")
+			cmd = exec.Command("kubectl", "describe", "pod", controllerPodName, "-n", namespace)
+			podDescription, err := utils.Run(cmd)
+			if err == nil {
+				fmt.Println("Pod description:\n", podDescription)
+			} else {
+				fmt.Println("Failed to describe controller pod")
+			}
+		}
+	})
+
+	SetDefaultEventuallyTimeout(2 * time.Minute)
+	SetDefaultEventuallyPollingInterval(time.Second)
+
+	Context("Manager", func() {
+		It("should run successfully", func() {
+			By("validating that the controller-manager pod is running as expected")
+			verifyControllerUp := func(g Gomega) {
+				// List the names of controller-manager pods that have no deletionTimestamp, one per line
+				cmd := exec.Command("kubectl", "get",
+					"pods", "-l", "control-plane=controller-manager",
+					"-o", "go-template={{ range .items }}"+
+						"{{ if not .metadata.deletionTimestamp }}"+
+						"{{ .metadata.name }}"+
+						"{{ \"\\n\" }}{{ end }}{{ end }}",
+					"-n", namespace,
+				)
+
+				podOutput, err := utils.Run(cmd)
+				g.Expect(err).NotTo(HaveOccurred(), "Failed to retrieve controller-manager pod information")
+				podNames := utils.GetNonEmptyLines(podOutput)
+				g.Expect(podNames).To(HaveLen(1), "expected 1 controller pod running")
+				controllerPodName = podNames[0] // shared with later specs and AfterEach
+				g.Expect(controllerPodName).To(ContainSubstring("controller-manager"))
+
+				// Validate the pod's status: the reported phase must be exactly "Running"
+				cmd = exec.Command("kubectl", "get",
+					"pods", controllerPodName, "-o", "jsonpath={.status.phase}",
+					"-n", namespace,
+				)
+				output, err := utils.Run(cmd)
+				g.Expect(err).NotTo(HaveOccurred())
+				g.Expect(output).To(Equal("Running"), "Incorrect controller-manager pod status")
+			}
+			Eventually(verifyControllerUp).Should(Succeed()) // uses the default timeout set on the container
+		})
+
+		It("should ensure the metrics endpoint is serving metrics", func() {
+			By("creating a ClusterRoleBinding for the service account to allow access to metrics")
+			cmd := exec.Command("kubectl", "create", "clusterrolebinding", metricsRoleBindingName,
+				"--clusterrole=tensor-fusion-operator-metrics-reader",
+				fmt.Sprintf("--serviceaccount=%s:%s", namespace, serviceAccountName),
+			)
+			_, err := utils.Run(cmd)
+			Expect(err).NotTo(HaveOccurred(), "Failed to create ClusterRoleBinding")
+
+			By("validating that the metrics service is available")
+			cmd = exec.Command("kubectl", "get", "service", metricsServiceName, "-n", namespace)
+			_, err = utils.Run(cmd)
+			Expect(err).NotTo(HaveOccurred(), "Metrics service should exist")
+
+			By("validating that the ServiceMonitor for Prometheus is applied in the namespace")
+			cmd = exec.Command("kubectl", "get", "ServiceMonitor", "-n", namespace)
+			_, err = utils.Run(cmd)
+			Expect(err).NotTo(HaveOccurred(), "ServiceMonitor should exist")
+
+			By("getting the service account token")
+			token, err := serviceAccountToken()
+			Expect(err).NotTo(HaveOccurred())
+			Expect(token).NotTo(BeEmpty()) // sent as the Bearer credential by the curl pod below
+
+			By("waiting for the metrics endpoint to be ready")
+			verifyMetricsEndpointReady := func(g Gomega) {
+				cmd := exec.Command("kubectl", "get", "endpoints", metricsServiceName, "-n", namespace)
+				output, err := utils.Run(cmd)
+				g.Expect(err).NotTo(HaveOccurred())
+				g.Expect(output).To(ContainSubstring("8443"), "Metrics endpoint is not ready")
+			}
+			Eventually(verifyMetricsEndpointReady).Should(Succeed()) // 8443 is the port curl targets below
+
+			By("verifying that the controller manager is serving the metrics server")
+			verifyMetricsServerStarted := func(g Gomega) {
+				cmd := exec.Command("kubectl", "logs", controllerPodName, "-n", namespace)
+				output, err := utils.Run(cmd)
+				g.Expect(err).NotTo(HaveOccurred())
+				g.Expect(output).To(ContainSubstring("controller-runtime.metrics\tServing metrics server"),
+					"Metrics server not yet started")
+			}
+			Eventually(verifyMetricsServerStarted).Should(Succeed())
+
+			By("creating the curl-metrics pod to access the metrics endpoint")
+			cmd = exec.Command("kubectl", "run", "curl-metrics", "--restart=Never",
+				"--namespace", namespace,
+				"--image=curlimages/curl:7.78.0",
+				"--", "/bin/sh", "-c", fmt.Sprintf(
+					"curl -v -k -H 'Authorization: Bearer %s' https://%s.%s.svc.cluster.local:8443/metrics",
+					token, metricsServiceName, namespace))
+			_, err = utils.Run(cmd)
+			Expect(err).NotTo(HaveOccurred(), "Failed to create curl-metrics pod")
+
+			By("waiting for the curl-metrics pod to complete.")
+			verifyCurlUp := func(g Gomega) {
+				cmd := exec.Command("kubectl", "get", "pods", "curl-metrics",
+					"-o", "jsonpath={.status.phase}",
+					"-n", namespace)
+				output, err := utils.Run(cmd)
+				g.Expect(err).NotTo(HaveOccurred())
+				g.Expect(output).To(Equal("Succeeded"), "curl pod in wrong status")
+			}
+			Eventually(verifyCurlUp, 5*time.Minute).Should(Succeed()) // explicit timeout instead of the default
+
+			By("getting the metrics by checking curl-metrics logs")
+			metricsOutput := getMetricsOutput()
+			Expect(metricsOutput).To(ContainSubstring(
+				"controller_runtime_reconcile_total",
+			))
+		})
+
+		// +kubebuilder:scaffold:e2e-webhooks-checks
+
+		// TODO: Customize the e2e test suite with scenarios specific to your project.
+		// Consider applying sample/CR(s) and check their status and/or verifying
+		// the reconciliation by using the metrics, i.e.:
+		// metricsOutput := getMetricsOutput()
+		// Expect(metricsOutput).To(ContainSubstring(
+		//    fmt.Sprintf(`controller_runtime_reconcile_total{controller="%s",result="success"} 1`,
+		//    strings.ToLower(<Kind>),
+		// ))
+	})
+})
+
+// serviceAccountToken returns a token for serviceAccountName in namespace. It writes a TokenRequest
+// body to a temp file, posts it via `kubectl create --raw`, and parses the token from the response.
+// Only a failure to write the temp file is returned as an error; the rest is asserted via Gomega.
+func serviceAccountToken() (string, error) {
+	const tokenRequestRawString = `{
+		"apiVersion": "authentication.k8s.io/v1",
+		"kind": "TokenRequest"
+	}`
+
+	// Temporary file holding the request body; it is removed again when this function returns.
+	secretName := fmt.Sprintf("%s-token-request", serviceAccountName)
+	tokenRequestFile := filepath.Join(os.TempDir(), secretName)
+	if err := os.WriteFile(tokenRequestFile, []byte(tokenRequestRawString), os.FileMode(0o644)); err != nil {
+		return "", err
+	}
+	defer func() { _ = os.Remove(tokenRequestFile) }()
+
+	var out string
+	verifyTokenCreation := func(g Gomega) {
+		// Execute kubectl command to create the token
+		cmd := exec.Command("kubectl", "create", "--raw", fmt.Sprintf(
+			"/api/v1/namespaces/%s/serviceaccounts/%s/token",
+			namespace,
+			serviceAccountName,
+		), "-f", tokenRequestFile)
+
+		output, err := cmd.CombinedOutput()
+		g.Expect(err).NotTo(HaveOccurred())
+
+		// Parse the JSON output to extract the token
+		var token tokenRequest
+		err = json.Unmarshal(output, &token)
+		g.Expect(err).NotTo(HaveOccurred())
+
+		out = token.Status.Token
+	}
+	Eventually(verifyTokenCreation).Should(Succeed())
+
+	return out, nil
+}
+
+// getMetricsOutput returns the logs of the curl-metrics pod, asserting that they could be
+// fetched and that they contain the "< HTTP/1.1 200 OK" response line.
+func getMetricsOutput() string {
+	By("getting the curl-metrics logs")
+	logs, err := utils.Run(exec.Command("kubectl", "logs", "curl-metrics", "-n", namespace))
+	Expect(err).NotTo(HaveOccurred(), "Failed to retrieve logs from curl pod")
+	Expect(logs).To(ContainSubstring("< HTTP/1.1 200 OK"))
+	return logs
+}
+
+// tokenRequest mirrors only the part of the Kubernetes TokenRequest API response that
+// serviceAccountToken reads: the token string found at status.token.
+type tokenRequest struct {
+	Status struct {
+		Token string `json:"token"`
+	} `json:"status"`
+}
diff --git a/test/utils/utils.go b/test/utils/utils.go
new file mode 100644
index 0000000..c3d51ce
--- /dev/null
+++ b/test/utils/utils.go
@@ -0,0 +1,251 @@
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package utils
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"os"
+	"os/exec"
+	"strings"
+
+	. "github.com/onsi/ginkgo/v2" //nolint:golint,revive
+)
+
+const (
+	prometheusOperatorVersion = "v0.77.1" // release tag formatted into prometheusOperatorURL
+	prometheusOperatorURL     = "https://github.com/prometheus-operator/prometheus-operator/" +
+		"releases/download/%s/bundle.yaml"
+
+	certmanagerVersion = "v1.16.0" // release tag formatted into certmanagerURLTmpl
+	certmanagerURLTmpl = "https://github.com/jetstack/cert-manager/releases/download/%s/cert-manager.yaml"
+)
+
+func warnError(err error) {
+	_, _ = fmt.Fprintln(GinkgoWriter, "warning:", err)
+}
+
+// Run executes cmd from the project directory and returns its combined stdout and stderr.
+func Run(cmd *exec.Cmd) (string, error) {
+	dir, _ := GetProjectDir() // error deliberately ignored; dir is used as returned
+	cmd.Dir = dir
+	// NOTE: os.Chdir moves the working directory of the whole test process, not only of cmd.
+	if err := os.Chdir(cmd.Dir); err != nil {
+		_, _ = fmt.Fprintf(GinkgoWriter, "chdir dir: %s\n", err)
+	}
+
+	cmd.Env = append(os.Environ(), "GO111MODULE=on")
+	command := strings.Join(cmd.Args, " ")
+	_, _ = fmt.Fprintf(GinkgoWriter, "running: %s\n", command)
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		return string(output), fmt.Errorf("%s failed with error: (%w) %s", command, err, string(output))
+	}
+
+	return string(output), nil
+}
+
+// InstallPrometheusOperator creates the pinned Prometheus Operator bundle in the cluster via kubectl.
+func InstallPrometheusOperator() error {
+	bundleURL := fmt.Sprintf(prometheusOperatorURL, prometheusOperatorVersion)
+	// Only the error is reported; the output captured by Run is discarded.
+	_, err := Run(exec.Command("kubectl", "create", "-f", bundleURL))
+	return err
+}
+
+// UninstallPrometheusOperator deletes the Prometheus Operator bundle; failures are only logged as warnings.
+func UninstallPrometheusOperator() {
+	bundleURL := fmt.Sprintf(prometheusOperatorURL, prometheusOperatorVersion)
+	_, err := Run(exec.Command("kubectl", "delete", "-f", bundleURL))
+	if err != nil {
+		warnError(err)
+	}
+}
+
+// IsPrometheusCRDsInstalled reports whether at least one well-known Prometheus CRD
+// shows up in the cluster's CRD list as printed by kubectl.
+func IsPrometheusCRDsInstalled() bool {
+	listCmd := exec.Command("kubectl", "get", "crds", "-o", "custom-columns=NAME:.metadata.name")
+	listing, err := Run(listCmd)
+	if err != nil {
+		// Without a listing nothing can be confirmed.
+		return false
+	}
+
+	// Common Prometheus CRD names to look for.
+	wanted := []string{
+		"prometheuses.monitoring.coreos.com",
+		"prometheusrules.monitoring.coreos.com",
+		"prometheusagents.monitoring.coreos.com",
+	}
+	for _, line := range GetNonEmptyLines(listing) {
+		for _, name := range wanted {
+			if strings.Contains(line, name) {
+				return true
+			}
+		}
+	}
+
+	return false
+}
+
+// UninstallCertManager deletes the cert-manager bundle; failures are only logged as warnings.
+func UninstallCertManager() {
+	manifestURL := fmt.Sprintf(certmanagerURLTmpl, certmanagerVersion)
+	_, err := Run(exec.Command("kubectl", "delete", "-f", manifestURL))
+	if err != nil {
+		warnError(err)
+	}
+}
+
+// InstallCertManager applies the pinned cert-manager manifest and waits for its webhook to become available.
+func InstallCertManager() error {
+	manifestURL := fmt.Sprintf(certmanagerURLTmpl, certmanagerVersion)
+	_, err := Run(exec.Command("kubectl", "apply", "-f", manifestURL))
+	if err != nil {
+		return err
+	}
+
+	// The webhook deployment can take a while to come up (for example when cert-manager
+	// is re-installed on a cluster), so block until it reports Available or 5m elapse.
+	waitArgs := []string{
+		"wait", "deployment.apps/cert-manager-webhook",
+		"--for", "condition=Available", "--namespace", "cert-manager", "--timeout", "5m",
+	}
+
+	_, err = Run(exec.Command("kubectl", waitArgs...))
+	return err
+}
+
+// IsCertManagerCRDsInstalled reports whether at least one well-known cert-manager CRD
+// shows up in the output of `kubectl get crds`.
+func IsCertManagerCRDsInstalled() bool {
+	// List every CRD known to the cluster.
+	listing, err := Run(exec.Command("kubectl", "get", "crds"))
+	if err != nil {
+		// Without a listing nothing can be confirmed.
+		return false
+	}
+
+	// Common cert-manager CRD names to look for.
+	wanted := []string{
+		"certificates.cert-manager.io",
+		"issuers.cert-manager.io",
+		"clusterissuers.cert-manager.io",
+		"certificaterequests.cert-manager.io",
+		"orders.acme.cert-manager.io",
+		"challenges.acme.cert-manager.io",
+	}
+
+	// A single matching line is enough to consider the CRDs installed.
+	for _, line := range GetNonEmptyLines(listing) {
+		for _, name := range wanted {
+			if strings.Contains(line, name) {
+				return true
+			}
+		}
+	}
+
+	// No listed line mentioned any of the wanted CRDs.
+	return false
+}
+
+// LoadImageToKindClusterWithName loads the local docker image name into a kind cluster.
+// The cluster defaults to "kind" and can be overridden with the KIND_CLUSTER environment variable.
+func LoadImageToKindClusterWithName(name string) error {
+	cluster, ok := os.LookupEnv("KIND_CLUSTER")
+	if !ok {
+		cluster = "kind"
+	}
+
+	_, err := Run(exec.Command("kind", "load", "docker-image", name, "--name", cluster))
+	return err
+}
+
+// GetNonEmptyLines splits output on "\n" and returns the non-empty lines in order.
+// The result is nil when output contains no non-empty line.
+func GetNonEmptyLines(output string) []string {
+	var lines []string
+	for _, line := range strings.Split(output, "\n") {
+		if len(line) == 0 {
+			continue
+		}
+		lines = append(lines, line)
+	}
+
+	return lines
+}
+
+// GetProjectDir returns the working directory with every "/test/e2e" occurrence removed,
+// which yields the project root when the tests run from test/e2e.
+func GetProjectDir() (string, error) {
+	cwd, err := os.Getwd()
+	if err != nil {
+		return cwd, err
+	}
+	return strings.ReplaceAll(cwd, "/test/e2e", ""), nil
+}
+
+// UncommentCode finds the first occurrence of target in the file filename and rewrites the file
+// with prefix trimmed from the start of every line of that occurrence. target may span lines.
+// It returns an error when the file cannot be read or written, or when target is not found.
+func UncommentCode(filename, target, prefix string) error {
+	content, err := os.ReadFile(filename) //nolint:gosec // false positive
+	if err != nil {
+		return err
+	}
+
+	idx := strings.Index(string(content), target)
+	if idx < 0 {
+		return fmt.Errorf("unable to find the code %s to be uncomment", target)
+	}
+
+	// Everything before the match is copied through unchanged.
+	out := new(bytes.Buffer)
+	if _, err = out.Write(content[:idx]); err != nil {
+		return err
+	}
+
+	scanner := bufio.NewScanner(bytes.NewBufferString(target))
+	if !scanner.Scan() {
+		// Nothing to rewrite (e.g. empty target), unless the scanner itself failed.
+		return scanner.Err()
+	}
+	for {
+		if _, err = out.WriteString(strings.TrimPrefix(scanner.Text(), prefix)); err != nil {
+			return err
+		}
+		// Avoid writing a newline in case the previous line was the last in target.
+		if !scanner.Scan() {
+			break
+		}
+		if _, err = out.WriteString("\n"); err != nil {
+			return err
+		}
+	}
+	// A scan cut short by an error would silently truncate the code, so do not write the file.
+	if err = scanner.Err(); err != nil {
+		return err
+	}
+
+	// Everything after the match is copied through unchanged.
+	if _, err = out.Write(content[idx+len(target):]); err != nil {
+		return err
+	}
+	return os.WriteFile(filename, out.Bytes(), 0644) //nolint:gosec // false positive
+}

From 3e184e1cefe6c7c056866f42d9e0b4cae25a445f Mon Sep 17 00:00:00 2001
From: 0x5457 <0x5457@protonmail.com>
Date: Tue, 3 Dec 2024 07:16:14 +0000
Subject: [PATCH 02/22] feat: initialize definition of GPUNode and
 TensorFusionConnection

---
 PROJECT                                       |  19 ++
 api/v1/gpunode_types.go                       |  51 ++++
 api/v1/groupversion_info.go                   |  36 +++
 api/v1/tensorfusionconnection_types.go        |  77 ++++++
 api/v1/zz_generated.deepcopy.go               | 224 ++++++++++++++++++
 cmd/main.go                                   |  18 ++
 ...r-fusion.ai.tensor-fusion.ai_gpunodes.yaml |  86 +++++++
 ...sor-fusion.ai_tensorfusionconnections.yaml | 107 +++++++++
 config/crd/kustomization.yaml                 |  21 ++
 config/crd/kustomizeconfig.yaml               |  19 ++
 config/default/kustomization.yaml             |   2 +-
 config/rbac/gpunode_editor_role.yaml          |  27 +++
 config/rbac/gpunode_viewer_role.yaml          |  23 ++
 config/rbac/kustomization.yaml                |   9 +
 config/rbac/role.yaml                         |  36 ++-
 .../tensorfusionconnection_editor_role.yaml   |  27 +++
 .../tensorfusionconnection_viewer_role.yaml   |  23 ++
 config/samples/kustomization.yaml             |   5 +
 .../samples/tensor-fusion.ai_v1_gpunode.yaml  |   9 +
 ...r-fusion.ai_v1_tensorfusionconnection.yaml |   9 +
 internal/controller/gpunode_controller.go     |  63 +++++
 .../controller/gpunode_controller_test.go     |  84 +++++++
 internal/controller/suite_test.go             |  96 ++++++++
 .../tensorfusionconnection_controller.go      |  63 +++++
 .../tensorfusionconnection_controller_test.go |  84 +++++++
 25 files changed, 1211 insertions(+), 7 deletions(-)
 create mode 100644 api/v1/gpunode_types.go
 create mode 100644 api/v1/groupversion_info.go
 create mode 100644 api/v1/tensorfusionconnection_types.go
 create mode 100644 api/v1/zz_generated.deepcopy.go
 create mode 100644 config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml
 create mode 100644 config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml
 create mode 100644 config/crd/kustomization.yaml
 create mode 100644 config/crd/kustomizeconfig.yaml
 create mode 100644 config/rbac/gpunode_editor_role.yaml
 create mode 100644 config/rbac/gpunode_viewer_role.yaml
 create mode 100644 config/rbac/tensorfusionconnection_editor_role.yaml
 create mode 100644 config/rbac/tensorfusionconnection_viewer_role.yaml
 create mode 100644 config/samples/kustomization.yaml
 create mode 100644 config/samples/tensor-fusion.ai_v1_gpunode.yaml
 create mode 100644 config/samples/tensor-fusion.ai_v1_tensorfusionconnection.yaml
 create mode 100644 internal/controller/gpunode_controller.go
 create mode 100644 internal/controller/gpunode_controller_test.go
 create mode 100644 internal/controller/suite_test.go
 create mode 100644 internal/controller/tensorfusionconnection_controller.go
 create mode 100644 internal/controller/tensorfusionconnection_controller_test.go

diff --git a/PROJECT b/PROJECT
index cb60042..80dae4a 100644
--- a/PROJECT
+++ b/PROJECT
@@ -7,4 +7,23 @@ layout:
 - go.kubebuilder.io/v4
 projectName: tensor-fusion-operator
 repo: github.com/NexusGPU/tensor-fusion-operator
+resources:
+- api:
+    crdVersion: v1
+    namespaced: true
+  controller: true
+  domain: tensor-fusion.ai
+  group: tensor-fusion.ai
+  kind: TensorFusionConnection
+  path: github.com/NexusGPU/tensor-fusion-operator/api/v1
+  version: v1
+- api:
+    crdVersion: v1
+    namespaced: true
+  controller: true
+  domain: tensor-fusion.ai
+  group: tensor-fusion.ai
+  kind: GPUNode
+  path: github.com/NexusGPU/tensor-fusion-operator/api/v1
+  version: v1
 version: "3"
diff --git a/api/v1/gpunode_types.go b/api/v1/gpunode_types.go
new file mode 100644
index 0000000..45524cd
--- /dev/null
+++ b/api/v1/gpunode_types.go
@@ -0,0 +1,51 @@
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package v1
+
+import (
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// GPUNodeStatus defines the observed state of GPUNode.
+type GPUNodeStatus struct {
+	Capacity Resource `json:"capacity"`
+	Used     Resource `json:"used"`
+}
+
+// +kubebuilder:object:root=true
+// +kubebuilder:subresource:status
+
+// GPUNode is the Schema for the gpunodes API.
+type GPUNode struct {
+	metav1.TypeMeta   `json:",inline"`
+	metav1.ObjectMeta `json:"metadata,omitempty"`
+
+	Status GPUNodeStatus `json:"status,omitempty"`
+}
+
+// +kubebuilder:object:root=true
+
+// GPUNodeList contains a list of GPUNode.
+type GPUNodeList struct {
+	metav1.TypeMeta `json:",inline"`
+	metav1.ListMeta `json:"metadata,omitempty"`
+	Items           []GPUNode `json:"items"`
+}
+
+func init() {
+	SchemeBuilder.Register(&GPUNode{}, &GPUNodeList{})
+}
diff --git a/api/v1/groupversion_info.go b/api/v1/groupversion_info.go
new file mode 100644
index 0000000..9172ec6
--- /dev/null
+++ b/api/v1/groupversion_info.go
@@ -0,0 +1,36 @@
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package v1 contains API Schema definitions for the tensor-fusion.ai.tensor-fusion.ai v1 API group.
+// +kubebuilder:object:generate=true
+// +groupName=tensor-fusion.ai.tensor-fusion.ai
+package v1
+
+import (
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	"sigs.k8s.io/controller-runtime/pkg/scheme"
+)
+
+var (
+	// GroupVersion is group version used to register these objects.
+	GroupVersion = schema.GroupVersion{Group: "tensor-fusion.ai.tensor-fusion.ai", Version: "v1"}
+
+	// SchemeBuilder is used to add go types to the GroupVersionKind scheme.
+	SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion}
+
+	// AddToScheme adds the types in this group-version to the given scheme.
+	AddToScheme = SchemeBuilder.AddToScheme
+)
diff --git a/api/v1/tensorfusionconnection_types.go b/api/v1/tensorfusionconnection_types.go
new file mode 100644
index 0000000..955f227
--- /dev/null
+++ b/api/v1/tensorfusionconnection_types.go
@@ -0,0 +1,77 @@
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package v1
+
+import (
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+type Resource struct {
+	Tflops resource.Quantity `json:"tflops"`
+	Vram   resource.Quantity `json:"vram"`
+}
+
+type Resources struct {
+	Request Resource `json:"request"`
+	Limit   Resource `json:"limit"`
+}
+
+// TensorFusionConnectionSpec defines the desired state of TensorFusionConnection.
+type TensorFusionConnectionSpec struct {
+	Resources Resources `json:"resources"`
+}
+
+type TensorFusionConnectionPhase string
+
+// These are the valid phases of a TensorFusionConnection.
+const (
+	TensorFusionConnectionPending TensorFusionConnectionPhase = "Pending"
+	TensorFusionConnectionRunning TensorFusionConnectionPhase = "Running"
+)
+
+// TensorFusionConnectionStatus defines the observed state of TensorFusionConnection.
+type TensorFusionConnectionStatus struct {
+	Phase         TensorFusionConnectionPhase `json:"phase"`
+	ConnectionURL string                      `json:"connectionURL"`
+	QosClass      string                      `json:"qosClass"`
+}
+
+// +kubebuilder:object:root=true
+// +kubebuilder:subresource:status
+
+// TensorFusionConnection is the Schema for the tensorfusionconnections API.
+type TensorFusionConnection struct {
+	metav1.TypeMeta   `json:",inline"`
+	metav1.ObjectMeta `json:"metadata,omitempty"`
+
+	Spec   TensorFusionConnectionSpec   `json:"spec,omitempty"`
+	Status TensorFusionConnectionStatus `json:"status,omitempty"`
+}
+
+// +kubebuilder:object:root=true
+
+// TensorFusionConnectionList contains a list of TensorFusionConnection.
+type TensorFusionConnectionList struct {
+	metav1.TypeMeta `json:",inline"`
+	metav1.ListMeta `json:"metadata,omitempty"`
+	Items           []TensorFusionConnection `json:"items"`
+}
+
+func init() {
+	SchemeBuilder.Register(&TensorFusionConnection{}, &TensorFusionConnectionList{})
+}
diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go
new file mode 100644
index 0000000..841b343
--- /dev/null
+++ b/api/v1/zz_generated.deepcopy.go
@@ -0,0 +1,224 @@
+//go:build !ignore_autogenerated
+
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Code generated by controller-gen. DO NOT EDIT.
+
+package v1
+
+import (
+	runtime "k8s.io/apimachinery/pkg/runtime"
+)
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *GPUNode) DeepCopyInto(out *GPUNode) {
+	*out = *in
+	out.TypeMeta = in.TypeMeta
+	in.ObjectMeta.DeepCopyInto(&out.ObjectMeta)
+	in.Status.DeepCopyInto(&out.Status)
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNode.
+func (in *GPUNode) DeepCopy() *GPUNode {
+	if in == nil {
+		return nil
+	}
+	out := new(GPUNode)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
+func (in *GPUNode) DeepCopyObject() runtime.Object {
+	if c := in.DeepCopy(); c != nil {
+		return c
+	}
+	return nil
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *GPUNodeList) DeepCopyInto(out *GPUNodeList) {
+	*out = *in
+	out.TypeMeta = in.TypeMeta
+	in.ListMeta.DeepCopyInto(&out.ListMeta)
+	if in.Items != nil {
+		in, out := &in.Items, &out.Items
+		*out = make([]GPUNode, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeList.
+func (in *GPUNodeList) DeepCopy() *GPUNodeList {
+	if in == nil {
+		return nil
+	}
+	out := new(GPUNodeList)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
+func (in *GPUNodeList) DeepCopyObject() runtime.Object {
+	if c := in.DeepCopy(); c != nil {
+		return c
+	}
+	return nil
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *GPUNodeStatus) DeepCopyInto(out *GPUNodeStatus) {
+	*out = *in
+	in.Capacity.DeepCopyInto(&out.Capacity)
+	in.Used.DeepCopyInto(&out.Used)
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeStatus.
+func (in *GPUNodeStatus) DeepCopy() *GPUNodeStatus {
+	if in == nil {
+		return nil
+	}
+	out := new(GPUNodeStatus)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *Resource) DeepCopyInto(out *Resource) {
+	*out = *in
+	out.Tflops = in.Tflops.DeepCopy()
+	out.Vram = in.Vram.DeepCopy()
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Resource.
+func (in *Resource) DeepCopy() *Resource {
+	if in == nil {
+		return nil
+	}
+	out := new(Resource)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *Resources) DeepCopyInto(out *Resources) {
+	*out = *in
+	in.Request.DeepCopyInto(&out.Request)
+	in.Limit.DeepCopyInto(&out.Limit)
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Resources.
+func (in *Resources) DeepCopy() *Resources {
+	if in == nil {
+		return nil
+	}
+	out := new(Resources)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *TensorFusionConnection) DeepCopyInto(out *TensorFusionConnection) {
+	*out = *in
+	out.TypeMeta = in.TypeMeta
+	in.ObjectMeta.DeepCopyInto(&out.ObjectMeta)
+	in.Spec.DeepCopyInto(&out.Spec)
+	out.Status = in.Status
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorFusionConnection.
+func (in *TensorFusionConnection) DeepCopy() *TensorFusionConnection {
+	if in == nil {
+		return nil
+	}
+	out := new(TensorFusionConnection)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
+func (in *TensorFusionConnection) DeepCopyObject() runtime.Object {
+	if c := in.DeepCopy(); c != nil {
+		return c
+	}
+	return nil
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *TensorFusionConnectionList) DeepCopyInto(out *TensorFusionConnectionList) {
+	*out = *in
+	out.TypeMeta = in.TypeMeta
+	in.ListMeta.DeepCopyInto(&out.ListMeta)
+	if in.Items != nil {
+		in, out := &in.Items, &out.Items
+		*out = make([]TensorFusionConnection, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorFusionConnectionList.
+func (in *TensorFusionConnectionList) DeepCopy() *TensorFusionConnectionList {
+	if in == nil {
+		return nil
+	}
+	out := new(TensorFusionConnectionList)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
+func (in *TensorFusionConnectionList) DeepCopyObject() runtime.Object {
+	if c := in.DeepCopy(); c != nil {
+		return c
+	}
+	return nil
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *TensorFusionConnectionSpec) DeepCopyInto(out *TensorFusionConnectionSpec) {
+	*out = *in
+	in.Resources.DeepCopyInto(&out.Resources)
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorFusionConnectionSpec.
+func (in *TensorFusionConnectionSpec) DeepCopy() *TensorFusionConnectionSpec {
+	if in == nil {
+		return nil
+	}
+	out := new(TensorFusionConnectionSpec)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *TensorFusionConnectionStatus) DeepCopyInto(out *TensorFusionConnectionStatus) {
+	*out = *in
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorFusionConnectionStatus.
+func (in *TensorFusionConnectionStatus) DeepCopy() *TensorFusionConnectionStatus {
+	if in == nil {
+		return nil
+	}
+	out := new(TensorFusionConnectionStatus)
+	in.DeepCopyInto(out)
+	return out
+}
diff --git a/cmd/main.go b/cmd/main.go
index 8992f96..4d3856e 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -34,6 +34,9 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
 	metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
 	"sigs.k8s.io/controller-runtime/pkg/webhook"
+
+	tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+	"github.com/NexusGPU/tensor-fusion-operator/internal/controller"
 	// +kubebuilder:scaffold:imports
 )
 
@@ -45,6 +48,7 @@ var (
 func init() {
 	utilruntime.Must(clientgoscheme.AddToScheme(scheme))
 
+	utilruntime.Must(tensorfusionaiv1.AddToScheme(scheme))
 	// +kubebuilder:scaffold:scheme
 }
 
@@ -138,6 +142,20 @@ func main() {
 		os.Exit(1)
 	}
 
+	if err = (&controller.TensorFusionConnectionReconciler{
+		Client: mgr.GetClient(),
+		Scheme: mgr.GetScheme(),
+	}).SetupWithManager(mgr); err != nil {
+		setupLog.Error(err, "unable to create controller", "controller", "TensorFusionConnection")
+		os.Exit(1)
+	}
+	if err = (&controller.GPUNodeReconciler{
+		Client: mgr.GetClient(),
+		Scheme: mgr.GetScheme(),
+	}).SetupWithManager(mgr); err != nil {
+		setupLog.Error(err, "unable to create controller", "controller", "GPUNode")
+		os.Exit(1)
+	}
 	// +kubebuilder:scaffold:builder
 
 	if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
diff --git a/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml b/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml
new file mode 100644
index 0000000..b3a3b46
--- /dev/null
+++ b/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml
@@ -0,0 +1,86 @@
+---
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  annotations:
+    controller-gen.kubebuilder.io/version: v0.16.4
+  name: gpunodes.tensor-fusion.ai.tensor-fusion.ai
+spec:
+  group: tensor-fusion.ai.tensor-fusion.ai
+  names:
+    kind: GPUNode
+    listKind: GPUNodeList
+    plural: gpunodes
+    singular: gpunode
+  scope: Namespaced
+  versions:
+  - name: v1
+    schema:
+      openAPIV3Schema:
+        description: GPUNode is the Schema for the gpunodes API.
+        properties:
+          apiVersion:
+            description: |-
+              APIVersion defines the versioned schema of this representation of an object.
+              Servers should convert recognized schemas to the latest internal value, and
+              may reject unrecognized values.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
+            type: string
+          kind:
+            description: |-
+              Kind is a string value representing the REST resource this object represents.
+              Servers may infer this from the endpoint the client submits requests to.
+              Cannot be updated.
+              In CamelCase.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
+            type: string
+          metadata:
+            type: object
+          status:
+            description: GPUNodeStatus defines the observed state of GPUNode.
+            properties:
+              capacity:
+                properties:
+                  tflops:
+                    anyOf:
+                    - type: integer
+                    - type: string
+                    pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                    x-kubernetes-int-or-string: true
+                  vram:
+                    anyOf:
+                    - type: integer
+                    - type: string
+                    pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                    x-kubernetes-int-or-string: true
+                required:
+                - tflops
+                - vram
+                type: object
+              used:
+                properties:
+                  tflops:
+                    anyOf:
+                    - type: integer
+                    - type: string
+                    pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                    x-kubernetes-int-or-string: true
+                  vram:
+                    anyOf:
+                    - type: integer
+                    - type: string
+                    pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                    x-kubernetes-int-or-string: true
+                required:
+                - tflops
+                - vram
+                type: object
+            required:
+            - capacity
+            - used
+            type: object
+        type: object
+    served: true
+    storage: true
+    subresources:
+      status: {}
diff --git a/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml b/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml
new file mode 100644
index 0000000..135776a
--- /dev/null
+++ b/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml
@@ -0,0 +1,107 @@
+---
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  annotations:
+    controller-gen.kubebuilder.io/version: v0.16.4
+  name: tensorfusionconnections.tensor-fusion.ai.tensor-fusion.ai
+spec:
+  group: tensor-fusion.ai.tensor-fusion.ai
+  names:
+    kind: TensorFusionConnection
+    listKind: TensorFusionConnectionList
+    plural: tensorfusionconnections
+    singular: tensorfusionconnection
+  scope: Namespaced
+  versions:
+  - name: v1
+    schema:
+      openAPIV3Schema:
+        description: TensorFusionConnection is the Schema for the tensorfusionconnections
+          API.
+        properties:
+          apiVersion:
+            description: |-
+              APIVersion defines the versioned schema of this representation of an object.
+              Servers should convert recognized schemas to the latest internal value, and
+              may reject unrecognized values.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
+            type: string
+          kind:
+            description: |-
+              Kind is a string value representing the REST resource this object represents.
+              Servers may infer this from the endpoint the client submits requests to.
+              Cannot be updated.
+              In CamelCase.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
+            type: string
+          metadata:
+            type: object
+          spec:
+            description: TensorFusionConnectionSpec defines the desired state of TensorFusionConnection.
+            properties:
+              resources:
+                properties:
+                  limit:
+                    properties:
+                      tflops:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                        x-kubernetes-int-or-string: true
+                      vram:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                        x-kubernetes-int-or-string: true
+                    required:
+                    - tflops
+                    - vram
+                    type: object
+                  request:
+                    properties:
+                      tflops:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                        x-kubernetes-int-or-string: true
+                      vram:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                        x-kubernetes-int-or-string: true
+                    required:
+                    - tflops
+                    - vram
+                    type: object
+                required:
+                - limit
+                - request
+                type: object
+            required:
+            - resources
+            type: object
+          status:
+            description: TensorFusionConnectionStatus defines the observed state of
+              TensorFusionConnection.
+            properties:
+              connectionURL:
+                type: string
+              phase:
+                type: string
+              qosClass:
+                type: string
+            required:
+            - connectionURL
+            - phase
+            - qosClass
+            type: object
+        type: object
+    served: true
+    storage: true
+    subresources:
+      status: {}
diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml
new file mode 100644
index 0000000..127a6ba
--- /dev/null
+++ b/config/crd/kustomization.yaml
@@ -0,0 +1,21 @@
+# This kustomization.yaml is not intended to be run by itself,
+# since it depends on service name and namespace that are out of this kustomize package.
+# It should be run by config/default
+resources:
+- bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml
+- bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml
+# +kubebuilder:scaffold:crdkustomizeresource
+
+patches:
+# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix.
+# patches here are for enabling the conversion webhook for each CRD
+# +kubebuilder:scaffold:crdkustomizewebhookpatch
+
+# [CERTMANAGER] To enable cert-manager, uncomment all the sections with [CERTMANAGER] prefix.
+# patches here are for enabling the CA injection for each CRD
+# +kubebuilder:scaffold:crdkustomizecainjectionpatch
+
+# [WEBHOOK] To enable webhook, uncomment the following section
+# the following config is for teaching kustomize how to do kustomization for CRDs.
+#configurations:
+#- kustomizeconfig.yaml
diff --git a/config/crd/kustomizeconfig.yaml b/config/crd/kustomizeconfig.yaml
new file mode 100644
index 0000000..ec5c150
--- /dev/null
+++ b/config/crd/kustomizeconfig.yaml
@@ -0,0 +1,19 @@
+# This file is for teaching kustomize how to substitute name and namespace reference in CRD
+nameReference:
+- kind: Service
+  version: v1
+  fieldSpecs:
+  - kind: CustomResourceDefinition
+    version: v1
+    group: apiextensions.k8s.io
+    path: spec/conversion/webhook/clientConfig/service/name
+
+namespace:
+- kind: CustomResourceDefinition
+  version: v1
+  group: apiextensions.k8s.io
+  path: spec/conversion/webhook/clientConfig/service/namespace
+  create: false
+
+varReference:
+- path: metadata/annotations
diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml
index c27f571..4f303ca 100644
--- a/config/default/kustomization.yaml
+++ b/config/default/kustomization.yaml
@@ -15,7 +15,7 @@ namePrefix: tensor-fusion-operator-
 #    someName: someValue
 
 resources:
-#- ../crd
+- ../crd
 - ../rbac
 - ../manager
 # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in
diff --git a/config/rbac/gpunode_editor_role.yaml b/config/rbac/gpunode_editor_role.yaml
new file mode 100644
index 0000000..11c1526
--- /dev/null
+++ b/config/rbac/gpunode_editor_role.yaml
@@ -0,0 +1,27 @@
+# permissions for end users to edit gpunodes.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  labels:
+    app.kubernetes.io/name: tensor-fusion-operator
+    app.kubernetes.io/managed-by: kustomize
+  name: gpunode-editor-role
+rules:
+- apiGroups:
+  - tensor-fusion.ai.tensor-fusion.ai
+  resources:
+  - gpunodes
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - update
+  - watch
+- apiGroups:
+  - tensor-fusion.ai.tensor-fusion.ai
+  resources:
+  - gpunodes/status
+  verbs:
+  - get
diff --git a/config/rbac/gpunode_viewer_role.yaml b/config/rbac/gpunode_viewer_role.yaml
new file mode 100644
index 0000000..a4808a0
--- /dev/null
+++ b/config/rbac/gpunode_viewer_role.yaml
@@ -0,0 +1,23 @@
+# permissions for end users to view gpunodes.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  labels:
+    app.kubernetes.io/name: tensor-fusion-operator
+    app.kubernetes.io/managed-by: kustomize
+  name: gpunode-viewer-role
+rules:
+- apiGroups:
+  - tensor-fusion.ai.tensor-fusion.ai
+  resources:
+  - gpunodes
+  verbs:
+  - get
+  - list
+  - watch
+- apiGroups:
+  - tensor-fusion.ai.tensor-fusion.ai
+  resources:
+  - gpunodes/status
+  verbs:
+  - get
diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml
index 5619aa0..0bb7cfe 100644
--- a/config/rbac/kustomization.yaml
+++ b/config/rbac/kustomization.yaml
@@ -18,3 +18,12 @@ resources:
 - metrics_auth_role.yaml
 - metrics_auth_role_binding.yaml
 - metrics_reader_role.yaml
+# For each CRD, "Editor" and "Viewer" roles are scaffolded by
+# default, aiding admins in cluster management. Those roles are
+# not used by the Project itself. You can comment out the following lines
+# if you do not want those helpers to be installed with your Project.
+- gpunode_editor_role.yaml
+- gpunode_viewer_role.yaml
+- tensorfusionconnection_editor_role.yaml
+- tensorfusionconnection_viewer_role.yaml
+
diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml
index 7454ff6..b5d3369 100644
--- a/config/rbac/role.yaml
+++ b/config/rbac/role.yaml
@@ -1,11 +1,35 @@
+---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
-  labels:
-    app.kubernetes.io/name: tensor-fusion-operator
-    app.kubernetes.io/managed-by: kustomize
   name: manager-role
 rules:
-- apiGroups: [""]
-  resources: ["pods"]
-  verbs: ["get", "list", "watch"]
+- apiGroups:
+  - tensor-fusion.ai.tensor-fusion.ai
+  resources:
+  - gpunodes
+  - tensorfusionconnections
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - update
+  - watch
+- apiGroups:
+  - tensor-fusion.ai.tensor-fusion.ai
+  resources:
+  - gpunodes/finalizers
+  - tensorfusionconnections/finalizers
+  verbs:
+  - update
+- apiGroups:
+  - tensor-fusion.ai.tensor-fusion.ai
+  resources:
+  - gpunodes/status
+  - tensorfusionconnections/status
+  verbs:
+  - get
+  - patch
+  - update
diff --git a/config/rbac/tensorfusionconnection_editor_role.yaml b/config/rbac/tensorfusionconnection_editor_role.yaml
new file mode 100644
index 0000000..d7627ed
--- /dev/null
+++ b/config/rbac/tensorfusionconnection_editor_role.yaml
@@ -0,0 +1,27 @@
+# permissions for end users to edit tensorfusionconnections.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  labels:
+    app.kubernetes.io/name: tensor-fusion-operator
+    app.kubernetes.io/managed-by: kustomize
+  name: tensorfusionconnection-editor-role
+rules:
+- apiGroups:
+  - tensor-fusion.ai.tensor-fusion.ai
+  resources:
+  - tensorfusionconnections
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - update
+  - watch
+- apiGroups:
+  - tensor-fusion.ai.tensor-fusion.ai
+  resources:
+  - tensorfusionconnections/status
+  verbs:
+  - get
diff --git a/config/rbac/tensorfusionconnection_viewer_role.yaml b/config/rbac/tensorfusionconnection_viewer_role.yaml
new file mode 100644
index 0000000..498b61e
--- /dev/null
+++ b/config/rbac/tensorfusionconnection_viewer_role.yaml
@@ -0,0 +1,23 @@
+# permissions for end users to view tensorfusionconnections.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  labels:
+    app.kubernetes.io/name: tensor-fusion-operator
+    app.kubernetes.io/managed-by: kustomize
+  name: tensorfusionconnection-viewer-role
+rules:
+- apiGroups:
+  - tensor-fusion.ai.tensor-fusion.ai
+  resources:
+  - tensorfusionconnections
+  verbs:
+  - get
+  - list
+  - watch
+- apiGroups:
+  - tensor-fusion.ai.tensor-fusion.ai
+  resources:
+  - tensorfusionconnections/status
+  verbs:
+  - get
diff --git a/config/samples/kustomization.yaml b/config/samples/kustomization.yaml
new file mode 100644
index 0000000..022e343
--- /dev/null
+++ b/config/samples/kustomization.yaml
@@ -0,0 +1,5 @@
+## Append samples of your project ##
+resources:
+- tensor-fusion.ai_v1_tensorfusionconnection.yaml
+- tensor-fusion.ai_v1_gpunode.yaml
+# +kubebuilder:scaffold:manifestskustomizesamples
diff --git a/config/samples/tensor-fusion.ai_v1_gpunode.yaml b/config/samples/tensor-fusion.ai_v1_gpunode.yaml
new file mode 100644
index 0000000..0957bdb
--- /dev/null
+++ b/config/samples/tensor-fusion.ai_v1_gpunode.yaml
@@ -0,0 +1,9 @@
+apiVersion: tensor-fusion.ai.tensor-fusion.ai/v1
+kind: GPUNode
+metadata:
+  labels:
+    app.kubernetes.io/name: tensor-fusion-operator
+    app.kubernetes.io/managed-by: kustomize
+  name: gpunode-sample
+spec:
+  # TODO(user): Add fields here
diff --git a/config/samples/tensor-fusion.ai_v1_tensorfusionconnection.yaml b/config/samples/tensor-fusion.ai_v1_tensorfusionconnection.yaml
new file mode 100644
index 0000000..91c2a95
--- /dev/null
+++ b/config/samples/tensor-fusion.ai_v1_tensorfusionconnection.yaml
@@ -0,0 +1,9 @@
+apiVersion: tensor-fusion.ai.tensor-fusion.ai/v1
+kind: TensorFusionConnection
+metadata:
+  labels:
+    app.kubernetes.io/name: tensor-fusion-operator
+    app.kubernetes.io/managed-by: kustomize
+  name: tensorfusionconnection-sample
+spec:
+  # TODO(user): Add fields here; the CRD schema marks spec.resources (request and limit, each with tflops and vram) as required.
diff --git a/internal/controller/gpunode_controller.go b/internal/controller/gpunode_controller.go
new file mode 100644
index 0000000..70fbd9b
--- /dev/null
+++ b/internal/controller/gpunode_controller.go
@@ -0,0 +1,63 @@
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package controller
+
+import (
+	"context"
+
+	"k8s.io/apimachinery/pkg/runtime"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+
+	tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+)
+
+// GPUNodeReconciler is the reconciler type registered for GPUNode objects.
+type GPUNodeReconciler struct {
+	client.Client
+	Scheme *runtime.Scheme
+}
+
+// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=gpunodes,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=gpunodes/status,verbs=get;update;patch
+// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=gpunodes/finalizers,verbs=update
+
+// Reconcile is part of the main Kubernetes reconciliation loop, which aims to
+// move the current state of the cluster closer to the desired state.
+// It is currently a scaffold: it ignores req, performs no cluster operations,
+// and always returns an empty Result with a nil error.
+// TODO(user): compare the state specified by the GPUNode object against the
+// actual cluster state and act to make the cluster reflect the desired state.
+//
+// For more details, check Reconcile and its Result here:
+// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/reconcile
+func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	_ = log.FromContext(ctx)
+
+	// TODO(user): your logic here
+
+	return ctrl.Result{}, nil
+}
+
+// SetupWithManager registers r with mgr as the controller named "gpunode" for GPUNode objects.
+func (r *GPUNodeReconciler) SetupWithManager(mgr ctrl.Manager) error {
+	return ctrl.NewControllerManagedBy(mgr).
+		For(&tensorfusionaiv1.GPUNode{}).
+		Named("gpunode").
+		Complete(r)
+}
diff --git a/internal/controller/gpunode_controller_test.go b/internal/controller/gpunode_controller_test.go
new file mode 100644
index 0000000..8cf0c89
--- /dev/null
+++ b/internal/controller/gpunode_controller_test.go
@@ -0,0 +1,84 @@
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package controller
+
+import (
+	"context"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/controller-runtime/pkg/reconcile"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+)
+
+var _ = Describe("GPUNode Controller", func() {
+	Context("When reconciling a resource", func() {
+		const resourceName = "test-resource"
+
+		ctx := context.Background()
+
+		typeNamespacedName := types.NamespacedName{
+			Name:      resourceName,
+			Namespace: "default", // TODO(user): Modify as needed
+		}
+		gpunode := &tensorfusionaiv1.GPUNode{}
+
+		BeforeEach(func() {
+			By("creating the custom resource for the Kind GPUNode")
+			err := k8sClient.Get(ctx, typeNamespacedName, gpunode)
+			if err != nil && errors.IsNotFound(err) { // NOTE(review): any other Get error is silently ignored here
+				resource := &tensorfusionaiv1.GPUNode{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      resourceName,
+						Namespace: "default",
+					},
+					// TODO(user): Specify other spec details if needed.
+				}
+				Expect(k8sClient.Create(ctx, resource)).To(Succeed())
+			}
+		})
+
+		AfterEach(func() {
+			// Delete the resource after each spec; the spec fails here if it can no longer be fetched.
+			resource := &tensorfusionaiv1.GPUNode{}
+			err := k8sClient.Get(ctx, typeNamespacedName, resource)
+			Expect(err).NotTo(HaveOccurred())
+
+			By("Cleanup the specific resource instance GPUNode")
+			Expect(k8sClient.Delete(ctx, resource)).To(Succeed())
+		})
+		It("should successfully reconcile the resource", func() {
+			By("Reconciling the created resource")
+			controllerReconciler := &GPUNodeReconciler{
+				Client: k8sClient,
+				Scheme: k8sClient.Scheme(),
+			}
+
+			_, err := controllerReconciler.Reconcile(ctx, reconcile.Request{
+				NamespacedName: typeNamespacedName,
+			})
+			Expect(err).NotTo(HaveOccurred())
+			// TODO(user): Add more specific assertions depending on your controller's reconciliation logic.
+			// Example: If you expect a certain status condition after reconciliation, verify it here.
+		})
+	})
+})
diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go
new file mode 100644
index 0000000..33f944a
--- /dev/null
+++ b/internal/controller/suite_test.go
@@ -0,0 +1,96 @@
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package controller
+
+import (
+	"context"
+	"fmt"
+	"path/filepath"
+	"runtime"
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"k8s.io/client-go/kubernetes/scheme"
+	"k8s.io/client-go/rest"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/envtest"
+	logf "sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/controller-runtime/pkg/log/zap"
+
+	tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+	// +kubebuilder:scaffold:imports
+)
+
+// These tests use Ginkgo (BDD-style Go testing framework). Refer to
+// http://onsi.github.io/ginkgo/ to learn more about Ginkgo.
+
+var cfg *rest.Config
+var k8sClient client.Client
+var testEnv *envtest.Environment
+var ctx context.Context
+var cancel context.CancelFunc
+
+func TestControllers(t *testing.T) {
+	RegisterFailHandler(Fail)
+	// Run the Ginkgo specs of this package from the standard `go test` entry point.
+	RunSpecs(t, "Controller Suite")
+}
+
+var _ = BeforeSuite(func() {
+	logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true)))
+
+	ctx, cancel = context.WithCancel(context.TODO())
+
+	By("bootstrapping test environment")
+	testEnv = &envtest.Environment{
+		CRDDirectoryPaths:     []string{filepath.Join("..", "..", "config", "crd", "bases")},
+		ErrorIfCRDPathMissing: true,
+
+		// BinaryAssetsDirectory is only required if you want to run the tests directly,
+		// without calling the makefile test target. If it is not set, envtest looks in the
+		// default path defined in controller-runtime, which is /usr/local/kubebuilder/.
+		// Note that the required binaries must be set up under the bin directory to run
+		// the tests directly. When we run make test they are set up and used automatically.
+		BinaryAssetsDirectory: filepath.Join("..", "..", "bin", "k8s",
+			fmt.Sprintf("1.31.0-%s-%s", runtime.GOOS, runtime.GOARCH)),
+	}
+
+	var err error
+	// cfg is a package-level variable declared in this file.
+	cfg, err = testEnv.Start()
+	Expect(err).NotTo(HaveOccurred())
+	Expect(cfg).NotTo(BeNil())
+
+	err = tensorfusionaiv1.AddToScheme(scheme.Scheme)
+	Expect(err).NotTo(HaveOccurred())
+
+	// +kubebuilder:scaffold:scheme
+
+	k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme})
+	Expect(err).NotTo(HaveOccurred())
+	Expect(k8sClient).NotTo(BeNil())
+
+})
+
+var _ = AfterSuite(func() {
+	By("tearing down the test environment")
+	// Cancel the suite-wide context first, then stop testEnv and require a clean stop.
+	cancel()
+	Expect(testEnv.Stop()).NotTo(HaveOccurred())
+})
diff --git a/internal/controller/tensorfusionconnection_controller.go b/internal/controller/tensorfusionconnection_controller.go
new file mode 100644
index 0000000..84e6f50
--- /dev/null
+++ b/internal/controller/tensorfusionconnection_controller.go
@@ -0,0 +1,63 @@
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package controller
+
+import (
+	"context"
+
+	"k8s.io/apimachinery/pkg/runtime"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+
+	tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+)
+
+// TensorFusionConnectionReconciler is the reconciler type registered for TensorFusionConnection objects.
+type TensorFusionConnectionReconciler struct {
+	client.Client
+	Scheme *runtime.Scheme
+}
+
+// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=tensorfusionconnections,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=tensorfusionconnections/status,verbs=get;update;patch
+// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=tensorfusionconnections/finalizers,verbs=update
+
+// Reconcile is part of the main Kubernetes reconciliation loop, which aims to
+// move the current state of the cluster closer to the desired state.
+// It is currently a scaffold: it ignores req, performs no cluster operations,
+// and always returns an empty Result with a nil error.
+// TODO(user): compare the state specified by the TensorFusionConnection object against
+// the actual cluster state and act to make the cluster reflect the desired state.
+//
+// For more details, check Reconcile and its Result here:
+// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/reconcile
+func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	_ = log.FromContext(ctx)
+
+	// TODO(user): your logic here
+
+	return ctrl.Result{}, nil
+}
+
+// SetupWithManager registers r with mgr as the controller named "tensorfusionconnection" for TensorFusionConnection objects.
+func (r *TensorFusionConnectionReconciler) SetupWithManager(mgr ctrl.Manager) error {
+	return ctrl.NewControllerManagedBy(mgr).
+		For(&tensorfusionaiv1.TensorFusionConnection{}).
+		Named("tensorfusionconnection").
+		Complete(r)
+}
diff --git a/internal/controller/tensorfusionconnection_controller_test.go b/internal/controller/tensorfusionconnection_controller_test.go
new file mode 100644
index 0000000..6ad2872
--- /dev/null
+++ b/internal/controller/tensorfusionconnection_controller_test.go
@@ -0,0 +1,84 @@
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package controller
+
+import (
+	"context"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/controller-runtime/pkg/reconcile"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+)
+
+var _ = Describe("TensorFusionConnection Controller", func() {
+	Context("When reconciling a resource", func() {
+		const resourceName = "test-resource"
+
+		ctx := context.Background()
+
+		typeNamespacedName := types.NamespacedName{
+			Name:      resourceName,
+			Namespace: "default", // TODO(user): Modify as needed
+		}
+		tensorfusionconnection := &tensorfusionaiv1.TensorFusionConnection{}
+
+		BeforeEach(func() {
+			By("creating the custom resource for the Kind TensorFusionConnection")
+			err := k8sClient.Get(ctx, typeNamespacedName, tensorfusionconnection)
+			if err != nil && errors.IsNotFound(err) { // NOTE(review): any other Get error is silently ignored here
+				resource := &tensorfusionaiv1.TensorFusionConnection{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      resourceName,
+						Namespace: "default",
+					},
+					// TODO(user): Specify other spec details if needed.
+				}
+				Expect(k8sClient.Create(ctx, resource)).To(Succeed())
+			}
+		})
+
+		AfterEach(func() {
+			// Delete the resource after each spec; the spec fails here if it can no longer be fetched.
+			resource := &tensorfusionaiv1.TensorFusionConnection{}
+			err := k8sClient.Get(ctx, typeNamespacedName, resource)
+			Expect(err).NotTo(HaveOccurred())
+
+			By("Cleanup the specific resource instance TensorFusionConnection")
+			Expect(k8sClient.Delete(ctx, resource)).To(Succeed())
+		})
+		It("should successfully reconcile the resource", func() {
+			By("Reconciling the created resource")
+			controllerReconciler := &TensorFusionConnectionReconciler{
+				Client: k8sClient,
+				Scheme: k8sClient.Scheme(),
+			}
+
+			_, err := controllerReconciler.Reconcile(ctx, reconcile.Request{
+				NamespacedName: typeNamespacedName,
+			})
+			Expect(err).NotTo(HaveOccurred())
+			// TODO(user): Add more specific assertions depending on your controller's reconciliation logic.
+			// Example: If you expect a certain status condition after reconciliation, verify it here.
+		})
+	})
+})

From 08e8e570f79f6c8c00c5ff7dcab26d87f06b0207 Mon Sep 17 00:00:00 2001
From: 0x5457 <0x5457@protonmail.com>
Date: Tue, 3 Dec 2024 07:34:51 +0000
Subject: [PATCH 03/22] feat(scheduler): implement a simple first-fit GPU node
 scheduler

---
 api/v1/gpunode_types.go          |   4 +-
 api/v1/zz_generated.deepcopy.go  |   2 +-
 internal/scheduler/naive.go      |  58 +++++++++++++
 internal/scheduler/naive_test.go | 144 +++++++++++++++++++++++++++++++
 internal/scheduler/scheduler.go  |  20 +++++
 5 files changed, 225 insertions(+), 3 deletions(-)
 create mode 100644 internal/scheduler/naive.go
 create mode 100644 internal/scheduler/naive_test.go
 create mode 100644 internal/scheduler/scheduler.go

diff --git a/api/v1/gpunode_types.go b/api/v1/gpunode_types.go
index 45524cd..24fe843 100644
--- a/api/v1/gpunode_types.go
+++ b/api/v1/gpunode_types.go
@@ -22,8 +22,8 @@ import (
 
 // GPUNodeStatus defines the observed state of GPUNode.
 type GPUNodeStatus struct {
-	Capacity Resource `json:"capacity"`
-	Used     Resource `json:"used"`
+	Capacity  Resource `json:"capacity"`
+	Available Resource `json:"available"`
 }
 
 // +kubebuilder:object:root=true
diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go
index 841b343..28bd614 100644
--- a/api/v1/zz_generated.deepcopy.go
+++ b/api/v1/zz_generated.deepcopy.go
@@ -86,7 +86,7 @@ func (in *GPUNodeList) DeepCopyObject() runtime.Object {
 func (in *GPUNodeStatus) DeepCopyInto(out *GPUNodeStatus) {
 	*out = *in
 	in.Capacity.DeepCopyInto(&out.Capacity)
-	in.Used.DeepCopyInto(&out.Used)
+	in.Available.DeepCopyInto(&out.Available)
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeStatus.
diff --git a/internal/scheduler/naive.go b/internal/scheduler/naive.go
new file mode 100644
index 0000000..24f95ff
--- /dev/null
+++ b/internal/scheduler/naive.go
@@ -0,0 +1,58 @@
+package scheduler
+
+import (
+	"fmt"
+	"sync"
+
+	v1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+)
+
+// NaiveScheduler implements a simple first-fit scheduling strategy over an in-memory node set.
+type NaiveScheduler struct {
+	sync.RWMutex
+	nodes map[string]*v1.GPUNode // keyed by node name; guarded by the embedded RWMutex
+}
+
+// NewNaiveScheduler returns a NaiveScheduler that tracks no nodes yet.
+func NewNaiveScheduler() *NaiveScheduler {
+	s := new(NaiveScheduler)
+	s.nodes = map[string]*v1.GPUNode{}
+	return s
+}
+
+// Schedule implements Scheduler: it returns the first tracked node whose
+// available tflops and vram both cover request, or an error if none does.
+func (s *NaiveScheduler) Schedule(request v1.Resource) (*v1.GPUNode, error) {
+	s.RLock()
+	defer s.RUnlock()
+	for _, node := range s.nodes { // map iteration order is unspecified, so "first" is arbitrary
+		// Compare on a copy: Quantity.Cmp has a pointer receiver and may rewrite its
+		// receiver's internal form, which must not happen to shared nodes under RLock.
+		free := node.Status.Available
+		if free.Tflops.Cmp(request.Tflops) >= 0 && free.Vram.Cmp(request.Vram) >= 0 {
+			return node, nil
+		}
+	}
+	return nil, fmt.Errorf("no suitable node found for request: tflops=%s, vram=%s", request.Tflops.String(), request.Vram.String())
+}
+
+// OnAdd implements Scheduler interface by tracking node under node.Name.
+func (s *NaiveScheduler) OnAdd(node *v1.GPUNode) {
+	s.Lock()
+	defer s.Unlock()
+	s.nodes[node.Name] = node // stores the caller's pointer, not a copy
+}
+
+// OnUpdate implements Scheduler interface; oldNode is ignored and newNode replaces the entry for newNode.Name.
+func (s *NaiveScheduler) OnUpdate(oldNode, newNode *v1.GPUNode) {
+	s.Lock()
+	defer s.Unlock()
+	s.nodes[newNode.Name] = newNode
+}
+
+// OnDelete implements Scheduler interface by dropping the entry for node.Name, if present.
+func (s *NaiveScheduler) OnDelete(node *v1.GPUNode) {
+	s.Lock()
+	defer s.Unlock()
+	delete(s.nodes, node.Name)
+}
diff --git a/internal/scheduler/naive_test.go b/internal/scheduler/naive_test.go
new file mode 100644
index 0000000..eac7740
--- /dev/null
+++ b/internal/scheduler/naive_test.go
@@ -0,0 +1,144 @@
+package scheduler
+
+import (
+	"testing"
+
+	v1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// createGPUNode builds a GPUNode called name whose Status.Available is parsed from
+// the tflops and vram quantity strings; resource.MustParse panics on invalid input.
+func createGPUNode(name string, tflops, vram string) *v1.GPUNode {
+	available := v1.Resource{
+		Tflops: resource.MustParse(tflops),
+		Vram:   resource.MustParse(vram),
+	}
+	node := &v1.GPUNode{
+		ObjectMeta: metav1.ObjectMeta{Name: name},
+		Status:     v1.GPUNodeStatus{Available: available},
+	}
+	return node
+}
+
+// createRequest parses the tflops and vram quantity strings into a v1.Resource;
+// resource.MustParse panics if either string is not a valid quantity.
+func createRequest(tflops, vram string) v1.Resource {
+	t, v := resource.MustParse(tflops), resource.MustParse(vram)
+	return v1.Resource{Tflops: t, Vram: v}
+}
+
+// TestNaiveScheduler_Schedule checks which node Schedule picks, or that it errors, per case.
+func TestNaiveScheduler_Schedule(t *testing.T) {
+	tests := []struct {
+		name      string
+		nodes     []*v1.GPUNode
+		request   v1.Resource
+		wantNode  string
+		wantError bool
+	}{
+		{
+			name: "simple match",
+			nodes: []*v1.GPUNode{
+				createGPUNode("node1", "100", "16Gi"),
+			},
+			request:   createRequest("50", "8Gi"),
+			wantNode:  "node1",
+			wantError: false,
+		},
+		{
+			name:      "no nodes",
+			nodes:     []*v1.GPUNode{},
+			request:   createRequest("50", "8Gi"),
+			wantNode:  "",
+			wantError: true,
+		},
+		{
+			name: "insufficient resources",
+			nodes: []*v1.GPUNode{
+				createGPUNode("node1", "40", "16Gi"),
+			},
+			request:   createRequest("50", "8Gi"),
+			wantNode:  "",
+			wantError: true,
+		},
+		{
+			name: "multiple nodes, first fit",
+			nodes: []*v1.GPUNode{
+				createGPUNode("node1", "40", "16Gi"),
+				createGPUNode("node2", "100", "32Gi"),
+			},
+			request:   createRequest("50", "8Gi"),
+			wantNode:  "node2",
+			wantError: false,
+		},
+		{
+			name: "exact match",
+			nodes: []*v1.GPUNode{
+				createGPUNode("node1", "50", "8Gi"),
+			},
+			request:   createRequest("50", "8Gi"),
+			wantNode:  "node1",
+			wantError: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Each case gets a fresh scheduler seeded with only its own nodes.
+			s := NewNaiveScheduler()
+			for _, node := range tt.nodes {
+				s.OnAdd(node)
+			}
+
+			// Try to schedule
+			got, err := s.Schedule(tt.request)
+
+			// Check error
+			if (err != nil) != tt.wantError {
+				t.Errorf("Schedule() error = %v, wantError %v", err, tt.wantError)
+				return
+			}
+
+			// Check result
+			if !tt.wantError {
+				if got == nil {
+					t.Error("Schedule() returned nil node when error not expected")
+					return
+				}
+				if got.Name != tt.wantNode {
+					t.Errorf("Schedule() got node = %v, want %v", got.Name, tt.wantNode)
+				}
+			}
+		})
+	}
+}
+
+// TestNaiveScheduler_NodeOperations checks that OnAdd, OnUpdate and OnDelete each change what Schedule returns.
+func TestNaiveScheduler_NodeOperations(t *testing.T) {
+	s := NewNaiveScheduler()
+	node1 := createGPUNode("node1", "100", "16Gi")
+	request := createRequest("50", "8Gi")
+	// Test OnAdd
+	s.OnAdd(node1)
+	got, err := s.Schedule(request)
+	if err != nil || got.Name != "node1" {
+		t.Errorf("After OnAdd: Schedule() got = %v, want node1", got)
+	}
+
+	// Test OnUpdate
+	node1Updated := createGPUNode("node1", "40", "16Gi")
+	s.OnUpdate(node1, node1Updated)
+	_, err = s.Schedule(request)
+	if err == nil {
+		t.Error("After OnUpdate: Schedule() should fail with insufficient resources")
+	}
+
+	// Test OnDelete: use a request the updated node could satisfy, so only removal explains a failure
+	s.OnDelete(node1Updated)
+	_, err = s.Schedule(createRequest("10", "8Gi"))
+	if err == nil {
+		t.Error("After OnDelete: Schedule() should fail with no nodes")
+	}
+}
diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go
new file mode 100644
index 0000000..6f9d9b5
--- /dev/null
+++ b/internal/scheduler/scheduler.go
@@ -0,0 +1,20 @@
+package scheduler
+
+import (
+	v1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+)
+
+// Scheduler is the interface that wraps the scheduling methods.
+type Scheduler interface {
+	// Schedule takes a Resource request and returns a pointer to a GPU node
+	// that can accommodate the request. If no suitable node is found, it returns
+	// a nil pointer and an error.
+	Schedule(request v1.Resource) (*v1.GPUNode, error)
+
+	// OnAdd is called when a new node is added
+	OnAdd(node *v1.GPUNode)
+	// OnUpdate is called when a node is modified
+	OnUpdate(oldNode, newNode *v1.GPUNode)
+	// OnDelete is called when a node is deleted
+	OnDelete(node *v1.GPUNode)
+}

From f60c82840e01f9d852e24a32810badb20a434de2 Mon Sep 17 00:00:00 2001
From: 0x5457 <0x5457@protonmail.com>
Date: Tue, 3 Dec 2024 09:55:30 +0000
Subject: [PATCH 04/22] feat: implement gpunode and tensorfusionconnection
 controller

---
 api/v1/gpunode_types.go                       |  1 +
 api/v1/zz_generated.deepcopy.go               |  5 +
 cmd/main.go                                   |  4 +-
 ...r-fusion.ai.tensor-fusion.ai_gpunodes.yaml | 11 ++-
 internal/controller/gpunode_controller.go     | 52 +++++++----
 .../tensorfusionconnection_controller.go      | 92 ++++++++++++++++---
 internal/scheduler/naive.go                   | 24 ++---
 internal/scheduler/naive_test.go              | 84 ++++++++++-------
 internal/scheduler/scheduler.go               | 10 +-
 internal/worker/worker.go                     |  9 ++
 10 files changed, 213 insertions(+), 79 deletions(-)
 create mode 100644 internal/worker/worker.go

diff --git a/api/v1/gpunode_types.go b/api/v1/gpunode_types.go
index 24fe843..6a752ba 100644
--- a/api/v1/gpunode_types.go
+++ b/api/v1/gpunode_types.go
@@ -24,6 +24,7 @@ import (
 type GPUNodeStatus struct {
 	Capacity  Resource `json:"capacity"`
 	Available Resource `json:"available"`
+	Devices   []string `json:"devices"`
 }
 
 // +kubebuilder:object:root=true
diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go
index 28bd614..d899ad7 100644
--- a/api/v1/zz_generated.deepcopy.go
+++ b/api/v1/zz_generated.deepcopy.go
@@ -87,6 +87,11 @@ func (in *GPUNodeStatus) DeepCopyInto(out *GPUNodeStatus) {
 	*out = *in
 	in.Capacity.DeepCopyInto(&out.Capacity)
 	in.Available.DeepCopyInto(&out.Available)
+	if in.Devices != nil {
+		in, out := &in.Devices, &out.Devices
+		*out = make([]string, len(*in))
+		copy(*out, *in)
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeStatus.
diff --git a/cmd/main.go b/cmd/main.go
index 4d3856e..ac14a23 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -17,6 +17,7 @@ limitations under the License.
 package main
 
 import (
+	"context"
 	"crypto/tls"
 	"flag"
 	"os"
@@ -142,6 +143,7 @@ func main() {
 		os.Exit(1)
 	}
 
+	ctx := context.Background()
 	if err = (&controller.TensorFusionConnectionReconciler{
 		Client: mgr.GetClient(),
 		Scheme: mgr.GetScheme(),
@@ -152,7 +154,7 @@ func main() {
 	if err = (&controller.GPUNodeReconciler{
 		Client: mgr.GetClient(),
 		Scheme: mgr.GetScheme(),
-	}).SetupWithManager(mgr); err != nil {
+	}).SetupWithManager(ctx, mgr); err != nil {
 		setupLog.Error(err, "unable to create controller", "controller", "GPUNode")
 		os.Exit(1)
 	}
diff --git a/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml b/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml
index b3a3b46..4829b1f 100644
--- a/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml
+++ b/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml
@@ -39,7 +39,7 @@ spec:
           status:
             description: GPUNodeStatus defines the observed state of GPUNode.
             properties:
-              capacity:
+              available:
                 properties:
                   tflops:
                     anyOf:
@@ -57,7 +57,7 @@ spec:
                 - tflops
                 - vram
                 type: object
-              used:
+              capacity:
                 properties:
                   tflops:
                     anyOf:
@@ -75,9 +75,14 @@ spec:
                 - tflops
                 - vram
                 type: object
+              devices:
+                items:
+                  type: string
+                type: array
             required:
+            - available
             - capacity
-            - used
+            - devices
             type: object
         type: object
     served: true
diff --git a/internal/controller/gpunode_controller.go b/internal/controller/gpunode_controller.go
index 70fbd9b..265f840 100644
--- a/internal/controller/gpunode_controller.go
+++ b/internal/controller/gpunode_controller.go
@@ -22,15 +22,18 @@ import (
 	"k8s.io/apimachinery/pkg/runtime"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
-	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/controller-runtime/pkg/event"
+	"sigs.k8s.io/controller-runtime/pkg/predicate"
 
-	tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+	scheduler "github.com/NexusGPU/tensor-fusion-operator/internal/scheduler"
 )
 
 // GPUNodeReconciler reconciles a GPUNode object
 type GPUNodeReconciler struct {
 	client.Client
-	Scheme *runtime.Scheme
+	Scheme    *runtime.Scheme
+	Scheduler scheduler.Scheduler
 }
 
 // +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=gpunodes,verbs=get;list;watch;create;update;patch;delete
@@ -39,25 +42,42 @@ type GPUNodeReconciler struct {
 
 // Reconcile is part of the main kubernetes reconciliation loop which aims to
 // move the current state of the cluster closer to the desired state.
-// TODO(user): Modify the Reconcile function to compare the state specified by
-// the GPUNode object against the actual cluster state, and then
-// perform operations to make the cluster state reflect the state specified by
-// the user.
-//
-// For more details, check Reconcile and its Result here:
-// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/reconcile
 func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
-	_ = log.FromContext(ctx)
-
-	// TODO(user): your logic here
-
+	// TODO: Calculate tflops and update capacity here
 	return ctrl.Result{}, nil
 }
 
 // SetupWithManager sets up the controller with the Manager.
-func (r *GPUNodeReconciler) SetupWithManager(mgr ctrl.Manager) error {
+func (r *GPUNodeReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
+	// Seed the scheduler through the uncached API reader: the manager cache is not started yet, so r.List would fail here
+	existingNodes := &tfv1.GPUNodeList{}
+	if err := mgr.GetAPIReader().List(ctx, existingNodes); err != nil {
+		return err
+	}
+
+	// Add all existing nodes to scheduler
+	for i := range existingNodes.Items {
+		r.Scheduler.OnAdd(&existingNodes.Items[i])
+	}
+
 	return ctrl.NewControllerManagedBy(mgr).
-		For(&tensorfusionaiv1.GPUNode{}).
+		For(&tfv1.GPUNode{}).
 		Named("gpunode").
+		WithEventFilter(
+			predicate.Funcs{
+				CreateFunc: func(e event.CreateEvent) bool {
+					r.Scheduler.OnAdd(e.Object.(*tfv1.GPUNode))
+					return true
+				},
+				UpdateFunc: func(e event.UpdateEvent) bool {
+					r.Scheduler.OnUpdate(e.ObjectOld.(*tfv1.GPUNode), e.ObjectNew.(*tfv1.GPUNode))
+					return true
+				},
+				DeleteFunc: func(e event.DeleteEvent) bool {
+					r.Scheduler.OnDelete(e.Object.(*tfv1.GPUNode))
+					return true
+				},
+			},
+		).
 		Complete(r)
 }
diff --git a/internal/controller/tensorfusionconnection_controller.go b/internal/controller/tensorfusionconnection_controller.go
index 84e6f50..b9223f4 100644
--- a/internal/controller/tensorfusionconnection_controller.go
+++ b/internal/controller/tensorfusionconnection_controller.go
@@ -19,18 +19,23 @@ package controller
 import (
 	"context"
 
+	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/client-go/util/retry"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/log"
 
-	tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+	scheduler "github.com/NexusGPU/tensor-fusion-operator/internal/scheduler"
+	"github.com/NexusGPU/tensor-fusion-operator/internal/worker"
 )
 
 // TensorFusionConnectionReconciler reconciles a TensorFusionConnection object
 type TensorFusionConnectionReconciler struct {
 	client.Client
-	Scheme *runtime.Scheme
+	Scheme    *runtime.Scheme
+	Scheduler scheduler.Scheduler
 }
 
 // +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=tensorfusionconnections,verbs=get;list;watch;create;update;patch;delete
@@ -39,25 +44,88 @@ type TensorFusionConnectionReconciler struct {
 
 // Reconcile is part of the main kubernetes reconciliation loop which aims to
 // move the current state of the cluster closer to the desired state.
-// TODO(user): Modify the Reconcile function to compare the state specified by
-// the TensorFusionConnection object against the actual cluster state, and then
-// perform operations to make the cluster state reflect the state specified by
-// the user.
-//
-// For more details, check Reconcile and its Result here:
-// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/reconcile
 func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
-	_ = log.FromContext(ctx)
+	log := log.FromContext(ctx)
 
-	// TODO(user): your logic here
+	// Get the TensorFusionConnection object
+	connection := &tfv1.TensorFusionConnection{}
+	if err := r.Get(ctx, req.NamespacedName, connection); err != nil {
+		if errors.IsNotFound(err) {
+			// Object not found, could have been deleted after reconcile request, return without error
+			return ctrl.Result{}, nil
+		}
+		log.Error(err, "Failed to get TensorFusionConnection")
+		return ctrl.Result{}, err
+	}
+
+	var node *tfv1.GPUNode
+	// If status is not set or pending, try to schedule
+	if connection.Status.Phase == "" || connection.Status.Phase == tfv1.TensorFusionConnectionPending {
+		var err error // declared up front so the assignment below fills the outer node instead of shadowing it
+		node, err = r.Scheduler.Schedule(connection.Spec.Resources.Request)
+		if err != nil {
+			log.Error(err, "Failed to schedule connection")
+			connection.Status.Phase = tfv1.TensorFusionConnectionPending
+		} else if node != nil {
+			connection.Status.Phase = tfv1.TensorFusionConnectionRunning
+			connection.Status.ConnectionURL = worker.GenerateConnectionURL(node, connection)
+		} else {
+			connection.Status.Phase = tfv1.TensorFusionConnectionPending
+		}
+	}
+
+	if err := r.MustUpdateStatus(ctx, connection, node); err != nil {
+		return ctrl.Result{}, err
+	}
 
 	return ctrl.Result{}, nil
 }
 
+func (r *TensorFusionConnectionReconciler) MustUpdateStatus(ctx context.Context, connection *tfv1.TensorFusionConnection, gpuNode *tfv1.GPUNode) error {
+	return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
+		// Get the latest version of the connection
+		latestConnection := &tfv1.TensorFusionConnection{}
+		if err := r.Get(ctx, client.ObjectKey{
+			Name:      connection.Name,
+			Namespace: connection.Namespace,
+		}, latestConnection); err != nil {
+			return err
+		}
+
+		// Update the status fields we care about
+		latestConnection.Status.Phase = connection.Status.Phase
+		latestConnection.Status.ConnectionURL = connection.Status.ConnectionURL
+
+		// Update the connection status
+		if err := r.Status().Update(ctx, latestConnection); err != nil {
+			return err
+		}
+
+		if gpuNode != nil {
+			// Get the latest version of the node
+			latestNode := &tfv1.GPUNode{}
+
+			if err := r.Get(ctx, client.ObjectKey{
+				Name:      gpuNode.Name,
+				Namespace: gpuNode.Namespace,
+			}, latestNode); err != nil {
+				return err
+			}
+
+			// Update the status fields we care about
+			latestNode.Status.Available = gpuNode.Status.Available
+			if err := r.Status().Update(ctx, latestNode); err != nil {
+				return err
+			}
+		}
+		return nil
+	})
+}
+
 // SetupWithManager sets up the controller with the Manager.
 func (r *TensorFusionConnectionReconciler) SetupWithManager(mgr ctrl.Manager) error {
 	return ctrl.NewControllerManagedBy(mgr).
-		For(&tensorfusionaiv1.TensorFusionConnection{}).
+		For(&tfv1.TensorFusionConnection{}).
 		Named("tensorfusionconnection").
 		Complete(r)
 }
diff --git a/internal/scheduler/naive.go b/internal/scheduler/naive.go
index 24f95ff..8d9a4d3 100644
--- a/internal/scheduler/naive.go
+++ b/internal/scheduler/naive.go
@@ -4,54 +4,56 @@ import (
 	"fmt"
 	"sync"
 
-	v1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
 )
 
 // NaiveScheduler implements a simple scheduling strategy
 type NaiveScheduler struct {
-	sync.RWMutex
-	nodes map[string]*v1.GPUNode
+	sync.Mutex
+	nodes map[string]*tfv1.GPUNode
 }
 
 // NewNaiveScheduler creates a new NaiveScheduler
 func NewNaiveScheduler() *NaiveScheduler {
 	return &NaiveScheduler{
-		nodes: make(map[string]*v1.GPUNode),
+		nodes: make(map[string]*tfv1.GPUNode),
 	}
 }
 
 // Schedule implements Scheduler interface
-func (s *NaiveScheduler) Schedule(request v1.Resource) (*v1.GPUNode, error) {
-	s.RLock()
-	defer s.RUnlock()
+func (s *NaiveScheduler) Schedule(request tfv1.Resource) (*tfv1.GPUNode, error) {
+	s.Lock()
+	defer s.Unlock()
 
 	// Simple strategy: return the first node that has enough resources
 	for _, node := range s.nodes {
 		if node.Status.Available.Tflops.Cmp(request.Tflops) >= 0 &&
 			node.Status.Available.Vram.Cmp(request.Vram) >= 0 {
+			// Update the node's available resources
+			node.Status.Available.Tflops.Sub(request.Tflops)
+			node.Status.Available.Vram.Sub(request.Vram)
 			return node, nil
 		}
 	}
-
 	return nil, fmt.Errorf("no suitable node found for request: %v", request)
 }
 
 // OnAdd implements Scheduler interface
-func (s *NaiveScheduler) OnAdd(node *v1.GPUNode) {
+func (s *NaiveScheduler) OnAdd(node *tfv1.GPUNode) {
 	s.Lock()
 	defer s.Unlock()
 	s.nodes[node.Name] = node
 }
 
 // OnUpdate implements Scheduler interface
-func (s *NaiveScheduler) OnUpdate(oldNode, newNode *v1.GPUNode) {
+func (s *NaiveScheduler) OnUpdate(oldNode, newNode *tfv1.GPUNode) {
 	s.Lock()
 	defer s.Unlock()
 	s.nodes[newNode.Name] = newNode
 }
 
 // OnDelete implements Scheduler interface
-func (s *NaiveScheduler) OnDelete(node *v1.GPUNode) {
+func (s *NaiveScheduler) OnDelete(node *tfv1.GPUNode) {
 	s.Lock()
 	defer s.Unlock()
 	delete(s.nodes, node.Name)
diff --git a/internal/scheduler/naive_test.go b/internal/scheduler/naive_test.go
index eac7740..be1affd 100644
--- a/internal/scheduler/naive_test.go
+++ b/internal/scheduler/naive_test.go
@@ -3,18 +3,18 @@ package scheduler
 import (
 	"testing"
 
-	v1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
-func createGPUNode(name string, tflops, vram string) *v1.GPUNode {
-	return &v1.GPUNode{
+func createGPUNode(name string, tflops, vram string) *tfv1.GPUNode {
+	return &tfv1.GPUNode{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: name,
 		},
-		Status: v1.GPUNodeStatus{
-			Available: v1.Resource{
+		Status: tfv1.GPUNodeStatus{
+			Available: tfv1.Resource{
 				Tflops: resource.MustParse(tflops),
 				Vram:   resource.MustParse(vram),
 			},
@@ -22,8 +22,8 @@ func createGPUNode(name string, tflops, vram string) *v1.GPUNode {
 	}
 }
 
-func createRequest(tflops, vram string) v1.Resource {
-	return v1.Resource{
+func createRequest(tflops, vram string) tfv1.Resource {
+	return tfv1.Resource{
 		Tflops: resource.MustParse(tflops),
 		Vram:   resource.MustParse(vram),
 	}
@@ -31,31 +31,35 @@ func createRequest(tflops, vram string) v1.Resource {
 
 func TestNaiveScheduler_Schedule(t *testing.T) {
 	tests := []struct {
-		name      string
-		nodes     []*v1.GPUNode
-		request   v1.Resource
-		wantNode  string
-		wantError bool
+		name                string
+		nodes               []*tfv1.GPUNode
+		request             tfv1.Resource
+		wantNode            string
+		wantError           bool
+		wantRemainingTflops string
+		wantRemainingVram   string
 	}{
 		{
 			name: "simple match",
-			nodes: []*v1.GPUNode{
+			nodes: []*tfv1.GPUNode{
 				createGPUNode("node1", "100", "16Gi"),
 			},
-			request:   createRequest("50", "8Gi"),
-			wantNode:  "node1",
-			wantError: false,
+			request:             createRequest("50", "8Gi"),
+			wantNode:            "node1",
+			wantError:           false,
+			wantRemainingTflops: "50",
+			wantRemainingVram:   "8Gi",
 		},
 		{
-			name: "no nodes",
-			nodes: []*v1.GPUNode{},
+			name:      "no nodes",
+			nodes:     []*tfv1.GPUNode{},
 			request:   createRequest("50", "8Gi"),
 			wantNode:  "",
 			wantError: true,
 		},
 		{
 			name: "insufficient resources",
-			nodes: []*v1.GPUNode{
+			nodes: []*tfv1.GPUNode{
 				createGPUNode("node1", "40", "16Gi"),
 			},
 			request:   createRequest("50", "8Gi"),
@@ -64,29 +68,33 @@ func TestNaiveScheduler_Schedule(t *testing.T) {
 		},
 		{
 			name: "multiple nodes, first fit",
-			nodes: []*v1.GPUNode{
+			nodes: []*tfv1.GPUNode{
 				createGPUNode("node1", "40", "16Gi"),
 				createGPUNode("node2", "100", "32Gi"),
 			},
-			request:   createRequest("50", "8Gi"),
-			wantNode:  "node2",
-			wantError: false,
+			request:             createRequest("50", "8Gi"),
+			wantNode:            "node2",
+			wantError:           false,
+			wantRemainingTflops: "50",
+			wantRemainingVram:   "24Gi",
 		},
 		{
 			name: "exact match",
-			nodes: []*v1.GPUNode{
+			nodes: []*tfv1.GPUNode{
 				createGPUNode("node1", "50", "8Gi"),
 			},
-			request:   createRequest("50", "8Gi"),
-			wantNode:  "node1",
-			wantError: false,
+			request:             createRequest("50", "8Gi"),
+			wantNode:            "node1",
+			wantError:           false,
+			wantRemainingTflops: "0",
+			wantRemainingVram:   "0",
 		},
 	}
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			s := NewNaiveScheduler()
-			
+
 			// Add nodes
 			for _, node := range tt.nodes {
 				s.OnAdd(node)
@@ -94,7 +102,7 @@ func TestNaiveScheduler_Schedule(t *testing.T) {
 
 			// Try to schedule
 			got, err := s.Schedule(tt.request)
-			
+
 			// Check error
 			if (err != nil) != tt.wantError {
 				t.Errorf("Schedule() error = %v, wantError %v", err, tt.wantError)
@@ -110,6 +118,20 @@ func TestNaiveScheduler_Schedule(t *testing.T) {
 				if got.Name != tt.wantNode {
 					t.Errorf("Schedule() got node = %v, want %v", got.Name, tt.wantNode)
 				}
+
+				// Check remaining resources
+				if tt.wantRemainingTflops != "" {
+					wantTflops := resource.MustParse(tt.wantRemainingTflops)
+					if got.Status.Available.Tflops.Cmp(wantTflops) != 0 {
+						t.Errorf("Remaining Tflops = %v, want %v", got.Status.Available.Tflops.String(), tt.wantRemainingTflops)
+					}
+				}
+				if tt.wantRemainingVram != "" {
+					wantVram := resource.MustParse(tt.wantRemainingVram)
+					if got.Status.Available.Vram.Cmp(wantVram) != 0 {
+						t.Errorf("Remaining Vram = %v, want %v", got.Status.Available.Vram.String(), tt.wantRemainingVram)
+					}
+				}
 			}
 		})
 	}
@@ -130,14 +152,14 @@ func TestNaiveScheduler_NodeOperations(t *testing.T) {
 	// Test OnUpdate
 	node1Updated := createGPUNode("node1", "40", "16Gi")
 	s.OnUpdate(node1, node1Updated)
-	got, err = s.Schedule(request)
+	_, err = s.Schedule(request)
 	if err == nil {
 		t.Error("After OnUpdate: Schedule() should fail with insufficient resources")
 	}
 
 	// Test OnDelete
 	s.OnDelete(node1Updated)
-	got, err = s.Schedule(request)
+	_, err = s.Schedule(request)
 	if err == nil {
 		t.Error("After OnDelete: Schedule() should fail with no nodes")
 	}
diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go
index 6f9d9b5..47ac776 100644
--- a/internal/scheduler/scheduler.go
+++ b/internal/scheduler/scheduler.go
@@ -1,7 +1,7 @@
 package scheduler
 
 import (
-	v1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
 )
 
 // Scheduler is the interface that wraps the scheduling methods
@@ -9,12 +9,12 @@ type Scheduler interface {
 	// Schedule takes a Resource Request and returns the pointer of the GPU node
 	// that can accommodate the request. If no suitable node is found, it returns
 	// an nil pointer and an error.
-	Schedule(request v1.Resource) (*v1.GPUNode, error)
+	Schedule(request tfv1.Resource) (*tfv1.GPUNode, error)
 
 	// OnAdd is called when a new node is added
-	OnAdd(node *v1.GPUNode)
+	OnAdd(node *tfv1.GPUNode)
 	// OnUpdate is called when a node is modified
-	OnUpdate(oldNode, newNode *v1.GPUNode)
+	OnUpdate(oldNode, newNode *tfv1.GPUNode)
 	// OnDelete is called when a node is deleted
-	OnDelete(node *v1.GPUNode)
+	OnDelete(node *tfv1.GPUNode)
 }
diff --git a/internal/worker/worker.go b/internal/worker/worker.go
new file mode 100644
index 0000000..74b93d3
--- /dev/null
+++ b/internal/worker/worker.go
@@ -0,0 +1,9 @@
+package worker
+
+import (
+	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+)
+
+func GenerateConnectionURL(_node *tfv1.GPUNode, _connection *tfv1.TensorFusionConnection) string {
+	return "TODO://"
+}

From 58e3406bd597c9bb7c34e2834a8c442069eb9a9f Mon Sep 17 00:00:00 2001
From: 0x5457 <0x5457@protonmail.com>
Date: Tue, 3 Dec 2024 10:57:30 +0000
Subject: [PATCH 05/22] feat: add HTTP server with connection router

---
 cmd/main.go                          |  26 ++++-
 go.mod                               |  20 ++++
 go.sum                               |  52 ++++++++++
 internal/server/router/connection.go | 145 +++++++++++++++++++++++++++
 internal/server/server.go            |  19 ++++
 5 files changed, 261 insertions(+), 1 deletion(-)
 create mode 100644 internal/server/router/connection.go
 create mode 100644 internal/server/server.go

diff --git a/cmd/main.go b/cmd/main.go
index ac14a23..d2d6e85 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -30,6 +30,7 @@ import (
 	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
 	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
 	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/healthz"
 	"sigs.k8s.io/controller-runtime/pkg/log/zap"
 	"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
@@ -38,6 +39,8 @@ import (
 
 	tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
 	"github.com/NexusGPU/tensor-fusion-operator/internal/controller"
+	"github.com/NexusGPU/tensor-fusion-operator/internal/server"
+	"github.com/NexusGPU/tensor-fusion-operator/internal/server/router"
 	// +kubebuilder:scaffold:imports
 )
 
@@ -119,7 +122,8 @@ func main() {
 		// this setup is not recommended for production.
 	}
 
-	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
+	kc := ctrl.GetConfigOrDie()
+	mgr, err := ctrl.NewManager(kc, ctrl.Options{
 		Scheme:                 scheme,
 		Metrics:                metricsServerOptions,
 		WebhookServer:          webhookServer,
@@ -169,6 +173,26 @@ func main() {
 		os.Exit(1)
 	}
 
+	// Initialize and start the HTTP server
+	client, err := client.NewWithWatch(kc, client.Options{Scheme: scheme})
+	if err != nil {
+		setupLog.Error(err, "failed to create client with watch")
+		os.Exit(1)
+	}
+	connectionRouter, err := router.NewConnectionRouter(ctx, client)
+	if err != nil {
+		setupLog.Error(err, "failed to create connection router")
+		os.Exit(1)
+	}
+	httpServer := server.NewHTTPServer(connectionRouter)
+	go func() {
+		err := httpServer.Run()
+		if err != nil {
+			setupLog.Error(err, "problem running HTTP server")
+			os.Exit(1)
+		}
+	}()
+
 	setupLog.Info("starting manager")
 	if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
 		setupLog.Error(err, "problem running manager")
diff --git a/go.mod b/go.mod
index c5f3936..0b333c1 100644
--- a/go.mod
+++ b/go.mod
@@ -3,6 +3,8 @@ module github.com/NexusGPU/tensor-fusion-operator
 go 1.22.0
 
 require (
+	github.com/gin-contrib/gzip v1.0.1
+	github.com/gin-gonic/gin v1.10.0
 	github.com/onsi/ginkgo/v2 v2.19.0
 	github.com/onsi/gomega v1.33.1
 	k8s.io/apimachinery v0.31.0
@@ -15,21 +17,31 @@ require (
 	github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a // indirect
 	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/blang/semver/v4 v4.0.0 // indirect
+	github.com/bytedance/sonic v1.11.6 // indirect
+	github.com/bytedance/sonic/loader v0.1.1 // indirect
 	github.com/cenkalti/backoff/v4 v4.3.0 // indirect
 	github.com/cespare/xxhash/v2 v2.3.0 // indirect
+	github.com/cloudwego/base64x v0.1.4 // indirect
+	github.com/cloudwego/iasm v0.2.0 // indirect
 	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
 	github.com/emicklei/go-restful/v3 v3.11.0 // indirect
 	github.com/evanphx/json-patch/v5 v5.9.0 // indirect
 	github.com/felixge/httpsnoop v1.0.4 // indirect
 	github.com/fsnotify/fsnotify v1.7.0 // indirect
 	github.com/fxamacker/cbor/v2 v2.7.0 // indirect
+	github.com/gabriel-vasile/mimetype v1.4.3 // indirect
+	github.com/gin-contrib/sse v0.1.0 // indirect
 	github.com/go-logr/logr v1.4.2 // indirect
 	github.com/go-logr/stdr v1.2.2 // indirect
 	github.com/go-logr/zapr v1.3.0 // indirect
 	github.com/go-openapi/jsonpointer v0.19.6 // indirect
 	github.com/go-openapi/jsonreference v0.20.2 // indirect
 	github.com/go-openapi/swag v0.22.4 // indirect
+	github.com/go-playground/locales v0.14.1 // indirect
+	github.com/go-playground/universal-translator v0.18.1 // indirect
+	github.com/go-playground/validator/v10 v10.20.0 // indirect
 	github.com/go-task/slim-sprig/v3 v3.0.0 // indirect
+	github.com/goccy/go-json v0.10.2 // indirect
 	github.com/gogo/protobuf v1.3.2 // indirect
 	github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
 	github.com/golang/protobuf v1.5.4 // indirect
@@ -44,10 +56,14 @@ require (
 	github.com/inconshreveable/mousetrap v1.1.0 // indirect
 	github.com/josharian/intern v1.0.0 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
+	github.com/klauspost/cpuid/v2 v2.2.7 // indirect
+	github.com/leodido/go-urn v1.4.0 // indirect
 	github.com/mailru/easyjson v0.7.7 // indirect
+	github.com/mattn/go-isatty v0.0.20 // indirect
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 	github.com/modern-go/reflect2 v1.0.2 // indirect
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
+	github.com/pelletier/go-toml/v2 v2.2.2 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/prometheus/client_golang v1.19.1 // indirect
 	github.com/prometheus/client_model v0.6.1 // indirect
@@ -56,6 +72,8 @@ require (
 	github.com/spf13/cobra v1.8.1 // indirect
 	github.com/spf13/pflag v1.0.5 // indirect
 	github.com/stoewer/go-strcase v1.2.0 // indirect
+	github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
+	github.com/ugorji/go/codec v1.2.12 // indirect
 	github.com/x448/float16 v0.8.4 // indirect
 	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 // indirect
 	go.opentelemetry.io/otel v1.28.0 // indirect
@@ -67,6 +85,8 @@ require (
 	go.opentelemetry.io/proto/otlp v1.3.1 // indirect
 	go.uber.org/multierr v1.11.0 // indirect
 	go.uber.org/zap v1.26.0 // indirect
+	golang.org/x/arch v0.8.0 // indirect
+	golang.org/x/crypto v0.24.0 // indirect
 	golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc // indirect
 	golang.org/x/net v0.26.0 // indirect
 	golang.org/x/oauth2 v0.21.0 // indirect
diff --git a/go.sum b/go.sum
index 0958667..96b016b 100644
--- a/go.sum
+++ b/go.sum
@@ -6,10 +6,18 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
 github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
 github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
 github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ=
+github.com/bytedance/sonic v1.11.6 h1:oUp34TzMlL+OY1OUWxHqsdkgC/Zfc85zGqw9siXjrc0=
+github.com/bytedance/sonic v1.11.6/go.mod h1:LysEHSvpvDySVdC2f87zGWf6CIKJcAvqab1ZaiQtds4=
+github.com/bytedance/sonic/loader v0.1.1 h1:c+e5Pt1k/cy5wMveRDyk2X4B9hF4g7an8N3zCYjJFNM=
+github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU=
 github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8=
 github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
 github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
 github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
+github.com/cloudwego/base64x v0.1.4 h1:jwCgWpFanWmN8xoIUHa2rtzmkd5J2plF/dnLS6Xd/0Y=
+github.com/cloudwego/base64x v0.1.4/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w=
+github.com/cloudwego/iasm v0.2.0 h1:1KNIy1I1H9hNNFEEH3DVnI4UujN+1zjpuk6gwHLTssg=
+github.com/cloudwego/iasm v0.2.0/go.mod h1:8rXZaNYT2n95jn+zTI1sDr+IgcD2GVs0nlbbQPiEFhY=
 github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
 github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -28,6 +36,14 @@ github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nos
 github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM=
 github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E=
 github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ=
+github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uqxgUFjbI0=
+github.com/gabriel-vasile/mimetype v1.4.3/go.mod h1:d8uq/6HKRL6CGdk+aubisF/M5GcPfT7nKyLpA0lbSSk=
+github.com/gin-contrib/gzip v1.0.1 h1:HQ8ENHODeLY7a4g1Au/46Z92bdGFl74OhxcZble9WJE=
+github.com/gin-contrib/gzip v1.0.1/go.mod h1:njt428fdUNRvjuJf16tZMYZ2Yl+WQB53X5wmhDwXvC4=
+github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE=
+github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI=
+github.com/gin-gonic/gin v1.10.0 h1:nTuyha1TYqgedzytsKYqna+DfLos46nTv2ygFy86HFU=
+github.com/gin-gonic/gin v1.10.0/go.mod h1:4PMNQiOhvDRa013RKVbsiNwoyezlm2rm0uX/T7kzp5Y=
 github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
 github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
 github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
@@ -42,8 +58,18 @@ github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En
 github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14=
 github.com/go-openapi/swag v0.22.4 h1:QLMzNJnMGPRNDCbySlcj1x01tzU8/9LTTL9hZZZogBU=
 github.com/go-openapi/swag v0.22.4/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14=
+github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s=
+github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
+github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA=
+github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY=
+github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY=
+github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91TpwSH2VMlDf28Uj24BCp08ZFTUY=
+github.com/go-playground/validator/v10 v10.20.0 h1:K9ISHbSaI0lyB2eWMPJo+kOS/FBExVwjEviJTixqxL8=
+github.com/go-playground/validator/v10 v10.20.0/go.mod h1:dbuPbCMFw/DrkbEynArYaCwl3amGuJotoKCe95atGMM=
 github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI=
 github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8=
+github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
+github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
 github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
 github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
 github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
@@ -76,6 +102,10 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr
 github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
 github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
 github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
+github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
+github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM=
+github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
+github.com/knz/go-libedit v1.10.1/go.mod h1:MZTVkCWyz0oBc7JOWP3wNAzd002ZbM/5hgShxwh4x8M=
 github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
 github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
 github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
@@ -83,8 +113,12 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
 github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
 github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
 github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
+github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ=
+github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI=
 github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
 github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
+github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
+github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
 github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
@@ -96,6 +130,8 @@ github.com/onsi/ginkgo/v2 v2.19.0 h1:9Cnnf7UHo57Hy3k6/m5k3dRfGTMXGvxhHFvkDTCTpvA
 github.com/onsi/ginkgo/v2 v2.19.0/go.mod h1:rlwLi9PilAFJ8jCg9UE1QP6VBpd6/xj3SRC0d6TU0To=
 github.com/onsi/gomega v1.33.1 h1:dsYjIxxSR755MDmKVsaFQTE22ChNBcuuTWgkUDSubOk=
 github.com/onsi/gomega v1.33.1/go.mod h1:U4R44UsT+9eLIaYRB2a5qajjtQYn0hauxvRm16AVYg0=
+github.com/pelletier/go-toml/v2 v2.2.2 h1:aYUidT7k73Pcl9nb2gScu7NSrKCSHIDE89b3+6Wq+LM=
+github.com/pelletier/go-toml/v2 v2.2.2/go.mod h1:1t835xjRzz80PqgE6HHgN2JOsmgYu/h4qDAS4n929Rs=
 github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
 github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
@@ -121,13 +157,20 @@ github.com/stoewer/go-strcase v1.2.0/go.mod h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
 github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
+github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
 github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
+github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
 github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
+github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
 github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
 github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
+github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
+github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
+github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE=
+github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
 github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
 github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
 github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
@@ -154,9 +197,14 @@ go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
 go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
 go.uber.org/zap v1.26.0 h1:sI7k6L95XOKS281NhVKOFCUNIvv9e0w4BF8N3u+tCRo=
 go.uber.org/zap v1.26.0/go.mod h1:dtElttAiwGvoJ/vj4IwHBS/gXsEu/pZ50mUIRWuG0so=
+golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
+golang.org/x/arch v0.8.0 h1:3wRIsP3pM4yUptoR96otTUOXI367OS0+c9eeRi9doIc=
+golang.org/x/arch v0.8.0/go.mod h1:FEVrYAQjsQXMVJ1nsMoVVXPZg6p2JE2mx8psSWTDQys=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
+golang.org/x/crypto v0.24.0 h1:mnl8DM0o513X8fdIkmyFE/5hTYxbwYOjDS/+rK6qpRI=
+golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM=
 golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc h1:mCRnTeVUjcrhlRmO0VK8a6k6Rrf6TF9htwo2pJVSjIU=
 golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc/go.mod h1:V1LtkGg67GoY2N1AnLN78QLrzxkLyJw7RJb1gzOOz9w=
 golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
@@ -177,6 +225,8 @@ golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws=
 golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
 golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA=
@@ -239,6 +289,8 @@ k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 h1:BZqlfIlq5YbRMFko6/PM7F
 k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340/go.mod h1:yD4MZYeKMBwQKVht279WycxKyM84kkAx2DPrTXaeb98=
 k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 h1:pUdcCO1Lk/tbT5ztQWOBi5HBgbBP1J8+AsQnQCKsi8A=
 k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
+nullprogram.com/x/optparse v1.0.0/go.mod h1:KdyPE+Igbe0jQUrVfMqDMeJQIJZEuyV7pjYmp6pbG50=
+rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=
 sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.30.3 h1:2770sDpzrjjsAtVhSeUFseziht227YAWYHLGNM8QPwY=
 sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.30.3/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw=
 sigs.k8s.io/controller-runtime v0.19.1 h1:Son+Q40+Be3QWb+niBXAg2vFiYWolDjjRfO8hn/cxOk=
diff --git a/internal/server/router/connection.go b/internal/server/router/connection.go
new file mode 100644
index 0000000..bdc3c3e
--- /dev/null
+++ b/internal/server/router/connection.go
@@ -0,0 +1,145 @@
+package router
+
+import (
+	"context"
+	"fmt"
+	"sync"
+
+	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+	"github.com/gin-gonic/gin"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/watch"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+type ConnectionRouter struct {
+	watcher *connectionWatcher
+}
+
+func NewConnectionRouter(ctx context.Context, client client.WithWatch) (*ConnectionRouter, error) {
+	watcher, err := newConnectionWatcher(ctx, client)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create connection watcher: %w", err)
+	}
+	return &ConnectionRouter{watcher: watcher}, nil
+}
+
+// Get replies with the connection URL once the requested connection is Running.
+func (cr *ConnectionRouter) Get(ctx *gin.Context) {
+	req := types.NamespacedName{Name: ctx.Query("name"), Namespace: ctx.Query("namespace")}
+	// Subscribe before the lookup so an update between the two is not missed.
+	ch, cancelFunc := cr.watcher.subscribe(req)
+	defer cancelFunc()
+
+	conn := cr.watcher.get(ctx, req)
+	if conn == nil {
+		ctx.JSON(404, gin.H{"error": "connection not found"})
+		return
+	}
+	// Check the fetched object first: a connection that is already Running may
+	// never produce another event, which used to leave the request hanging.
+	for conn.Status.Phase != tfv1.TensorFusionConnectionRunning {
+		select {
+		case conn = <-ch:
+		case <-ctx.Request.Context().Done(): // client went away; stop waiting
+			return
+		}
+	}
+	ctx.JSON(200, conn.Status.ConnectionURL)
+}
+
+type connectionChannel chan *tfv1.TensorFusionConnection
+type connectionSet map[connectionChannel]struct{}
+type connectionSubscribers map[types.NamespacedName]connectionSet
+
+type connectionWatcher struct {
+	client client.WithWatch
+
+	mu   sync.RWMutex
+	subs connectionSubscribers
+}
+
+func newConnectionWatcher(ctx context.Context, client client.WithWatch) (*connectionWatcher, error) {
+	cw := &connectionWatcher{
+		client: client,
+		subs:   make(connectionSubscribers),
+	}
+	watcher, err := cw.client.Watch(ctx, &tfv1.TensorFusionConnectionList{})
+	if err != nil {
+		return nil, fmt.Errorf("failed to watch connections: %w", err)
+	}
+	go cw.watchConnections(ctx, watcher)
+	return cw, nil
+}
+
+func (cw *connectionWatcher) get(ctx context.Context, req types.NamespacedName) *tfv1.TensorFusionConnection {
+	conn := &tfv1.TensorFusionConnection{}
+	if err := cw.client.Get(ctx, req, conn); err != nil {
+		return nil
+	}
+	return conn
+}
+
+// subscribe registers a buffered channel for updates to req and returns it with a cancel func that unregisters and closes it.
+func (cw *connectionWatcher) subscribe(req types.NamespacedName) (connectionChannel, func()) {
+	ch := make(connectionChannel, 1)
+
+	cw.mu.Lock()
+	if _, exists := cw.subs[req]; !exists {
+		cw.subs[req] = make(connectionSet)
+	}
+	cw.subs[req][ch] = struct{}{}
+	cw.mu.Unlock()
+
+	cancelFunc := func() {
+		cw.mu.Lock()
+		defer cw.mu.Unlock()
+
+		if chans, exists := cw.subs[req]; exists {
+			delete(chans, ch)
+			close(ch)
+
+			// Drop the map entry once the last subscriber for req is gone
+			if len(chans) == 0 {
+				delete(cw.subs, req)
+			}
+		}
+	}
+
+	return ch, cancelFunc
+}
+
+func (cw *connectionWatcher) watchConnections(ctx context.Context, watcher watch.Interface) {
+	// Fan out watch events until ctx is done or the watch closes. NOTE(review): no re-watch after close — confirm intended.
+	defer watcher.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case event, ok := <-watcher.ResultChan():
+			if !ok {
+				return
+			}
+
+			conn, ok := event.Object.(*tfv1.TensorFusionConnection)
+			if !ok {
+				continue
+			}
+
+			// Look up the subscribers registered under this connection's namespaced name
+			cw.mu.RLock()
+			key := types.NamespacedName{Name: conn.Name, Namespace: conn.Namespace}
+			if subscribers, exists := cw.subs[key]; exists {
+				// The read lock stays held while sending; the sends below never block
+				for ch := range subscribers {
+					select {
+					case ch <- conn:
+					default:
+						// Subscriber's buffer is full: drop this update rather than block
+					}
+				}
+			}
+			cw.mu.RUnlock()
+		}
+	}
+}
diff --git a/internal/server/server.go b/internal/server/server.go
new file mode 100644
index 0000000..fa2995c
--- /dev/null
+++ b/internal/server/server.go
@@ -0,0 +1,19 @@
+package server
+
+import (
+	"github.com/NexusGPU/tensor-fusion-operator/internal/server/router"
+	"github.com/gin-contrib/gzip"
+	"github.com/gin-gonic/gin"
+)
+
+func NewHTTPServer(
+	cr *router.ConnectionRouter,
+) *gin.Engine {
+	r := gin.New()
+	r.Use(gzip.Gzip(gzip.DefaultCompression))
+	r.Use(gin.Recovery())
+
+	apiGroup := r.Group("/api")
+	apiGroup.GET("/connection", cr.Get)
+	return r
+}

From 022b2713ac0ed490edb9c647d6d8462528473cb0 Mon Sep 17 00:00:00 2001
From: 0x5457 <0x5457@protonmail.com>
Date: Tue, 3 Dec 2024 11:00:45 +0000
Subject: [PATCH 06/22] fix typo

---
 internal/controller/gpunode_controller.go | 2 +-
 test/e2e/e2e_test.go                      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/internal/controller/gpunode_controller.go b/internal/controller/gpunode_controller.go
index 265f840..bab8ff5 100644
--- a/internal/controller/gpunode_controller.go
+++ b/internal/controller/gpunode_controller.go
@@ -43,7 +43,7 @@ type GPUNodeReconciler struct {
 // Reconcile is part of the main kubernetes reconciliation loop which aims to
 // move the current state of the cluster closer to the desired state.
 func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
-	// TOOD: Calculate tflops and update capacity here
+	// TODO: Calculate tflops and update capacity here
 	return ctrl.Result{}, nil
 }
 
diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go
index a218480..aeac7d0 100644
--- a/test/e2e/e2e_test.go
+++ b/test/e2e/e2e_test.go
@@ -278,7 +278,7 @@ func serviceAccountToken() (string, error) {
 
 		// Parse the JSON output to extract the token
 		var token tokenRequest
-		err = json.Unmarshal([]byte(output), &token)
+		err = json.Unmarshal(output, &token)
 		g.Expect(err).NotTo(HaveOccurred())
 
 		out = token.Status.Token

From b5a3b0a7e154fc5941d3ff16be936e12e474d5cb Mon Sep 17 00:00:00 2001
From: 0x5457 <0x5457@protonmail.com>
Date: Wed, 4 Dec 2024 08:05:26 +0000
Subject: [PATCH 07/22] chore: simplify error messages

---
 internal/server/router/connection.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/internal/server/router/connection.go b/internal/server/router/connection.go
index bdc3c3e..1abad0e 100644
--- a/internal/server/router/connection.go
+++ b/internal/server/router/connection.go
@@ -19,7 +19,7 @@ type ConnectionRouter struct {
 func NewConnectionRouter(ctx context.Context, client client.WithWatch) (*ConnectionRouter, error) {
 	watcher, err := newConnectionWatcher(ctx, client)
 	if err != nil {
-		return nil, fmt.Errorf("failed to create connection watcher: %w", err)
+		return nil, fmt.Errorf("create connection watcher: %w", err)
 	}
 	return &ConnectionRouter{watcher: watcher}, nil
 }
@@ -66,7 +66,7 @@ func newConnectionWatcher(ctx context.Context, client client.WithWatch) (*connec
 	}
 	watcher, err := cw.client.Watch(ctx, &tfv1.TensorFusionConnectionList{})
 	if err != nil {
-		return nil, fmt.Errorf("failed to watch connections: %w", err)
+		return nil, fmt.Errorf("watch connections: %w", err)
 	}
 	go cw.watchConnections(ctx, watcher)
 	return cw, nil

From 289d7c6a12b1e818b09694d3246dfc5c9dba1d0f Mon Sep 17 00:00:00 2001
From: 0x5457 <0x5457@protonmail.com>
Date: Thu, 5 Dec 2024 07:21:47 +0000
Subject: [PATCH 08/22] feat: add pod mutation webhook

---
 PROJECT                                       |   8 +
 cmd/main.go                                   |  11 +
 config/certmanager/certificate.yaml           |  35 +++
 config/certmanager/kustomization.yaml         |   5 +
 config/certmanager/kustomizeconfig.yaml       |   8 +
 config/crd/kustomization.yaml                 |   4 +-
 config/crd/patches/cainjection_in_pods.yaml   |   7 +
 config/crd/patches/webhook_in_pods.yaml       |  16 ++
 config/default/kustomization.yaml             |   4 +-
 config/default/manager_webhook_patch.yaml     |  26 ++
 .../network-policy/allow-webhook-traffic.yaml |  26 ++
 config/network-policy/kustomization.yaml      |   1 +
 config/webhook/kustomization.yaml             |   6 +
 config/webhook/kustomizeconfig.yaml           |  22 ++
 config/webhook/service.yaml                   |  15 ++
 go.mod                                        |   2 +-
 internal/config/config.go                     |  16 ++
 internal/webhook/v1/pod_webhook.go            | 226 ++++++++++++++++++
 internal/webhook/v1/pod_webhook_test.go       |  55 +++++
 test/e2e/e2e_test.go                          |  10 +
 20 files changed, 498 insertions(+), 5 deletions(-)
 create mode 100644 config/certmanager/certificate.yaml
 create mode 100644 config/certmanager/kustomization.yaml
 create mode 100644 config/certmanager/kustomizeconfig.yaml
 create mode 100644 config/crd/patches/cainjection_in_pods.yaml
 create mode 100644 config/crd/patches/webhook_in_pods.yaml
 create mode 100644 config/default/manager_webhook_patch.yaml
 create mode 100644 config/network-policy/allow-webhook-traffic.yaml
 create mode 100644 config/webhook/kustomization.yaml
 create mode 100644 config/webhook/kustomizeconfig.yaml
 create mode 100644 config/webhook/service.yaml
 create mode 100644 internal/config/config.go
 create mode 100644 internal/webhook/v1/pod_webhook.go
 create mode 100644 internal/webhook/v1/pod_webhook_test.go

diff --git a/PROJECT b/PROJECT
index 80dae4a..cde7c4f 100644
--- a/PROJECT
+++ b/PROJECT
@@ -26,4 +26,12 @@ resources:
   kind: GPUNode
   path: github.com/NexusGPU/tensor-fusion-operator/api/v1
   version: v1
+- core: true
+  group: core
+  kind: Pod
+  path: k8s.io/api/core/v1
+  version: v1
+  webhooks:
+    conversion: true
+    webhookVersion: v1
 version: "3"
diff --git a/cmd/main.go b/cmd/main.go
index d2d6e85..7f7bf2b 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -38,9 +38,11 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/webhook"
 
 	tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+	"github.com/NexusGPU/tensor-fusion-operator/internal/config"
 	"github.com/NexusGPU/tensor-fusion-operator/internal/controller"
 	"github.com/NexusGPU/tensor-fusion-operator/internal/server"
 	"github.com/NexusGPU/tensor-fusion-operator/internal/server/router"
+	webhookcorev1 "github.com/NexusGPU/tensor-fusion-operator/internal/webhook/v1"
 	// +kubebuilder:scaffold:imports
 )
 
@@ -148,6 +150,7 @@ func main() {
 	}
 
 	ctx := context.Background()
+	config := config.NewDefaultConfig()
 	if err = (&controller.TensorFusionConnectionReconciler{
 		Client: mgr.GetClient(),
 		Scheme: mgr.GetScheme(),
@@ -162,6 +165,14 @@ func main() {
 		setupLog.Error(err, "unable to create controller", "controller", "GPUNode")
 		os.Exit(1)
 	}
+
+	// nolint:goconst
+	if os.Getenv("ENABLE_WEBHOOKS") != "false" {
+		if err = webhookcorev1.SetupPodWebhookWithManager(mgr, &config.PodMutator); err != nil {
+			setupLog.Error(err, "unable to create webhook", "webhook", "Pod")
+			os.Exit(1)
+		}
+	}
 	// +kubebuilder:scaffold:builder
 
 	if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
diff --git a/config/certmanager/certificate.yaml b/config/certmanager/certificate.yaml
new file mode 100644
index 0000000..7fc10a8
--- /dev/null
+++ b/config/certmanager/certificate.yaml
@@ -0,0 +1,35 @@
+# The following manifests contain a self-signed issuer CR and a certificate CR.
+# More documentation can be found at https://docs.cert-manager.io
+# WARNING: Targets CertManager v1.0. Check https://cert-manager.io/docs/installation/upgrading/ for breaking changes.
+apiVersion: cert-manager.io/v1
+kind: Issuer
+metadata:
+  labels:
+    app.kubernetes.io/name: tensor-fusion-operator
+    app.kubernetes.io/managed-by: kustomize
+  name: selfsigned-issuer
+  namespace: system
+spec:
+  selfSigned: {}
+---
+apiVersion: cert-manager.io/v1
+kind: Certificate
+metadata:
+  labels:
+    app.kubernetes.io/name: certificate
+    app.kubernetes.io/instance: serving-cert
+    app.kubernetes.io/component: certificate
+    app.kubernetes.io/created-by: tensor-fusion-operator
+    app.kubernetes.io/part-of: tensor-fusion-operator
+    app.kubernetes.io/managed-by: kustomize
+  name: serving-cert  # this name should match the one that appears in kustomizeconfig.yaml
+  namespace: system
+spec:
+  # SERVICE_NAME and SERVICE_NAMESPACE will be substituted by kustomize
+  dnsNames:
+  - SERVICE_NAME.SERVICE_NAMESPACE.svc
+  - SERVICE_NAME.SERVICE_NAMESPACE.svc.cluster.local
+  issuerRef:
+    kind: Issuer
+    name: selfsigned-issuer
+  secretName: webhook-server-cert # this secret will not be prefixed, since it's not managed by kustomize
diff --git a/config/certmanager/kustomization.yaml b/config/certmanager/kustomization.yaml
new file mode 100644
index 0000000..bebea5a
--- /dev/null
+++ b/config/certmanager/kustomization.yaml
@@ -0,0 +1,5 @@
+resources:
+- certificate.yaml
+
+configurations:
+- kustomizeconfig.yaml
diff --git a/config/certmanager/kustomizeconfig.yaml b/config/certmanager/kustomizeconfig.yaml
new file mode 100644
index 0000000..cf6f89e
--- /dev/null
+++ b/config/certmanager/kustomizeconfig.yaml
@@ -0,0 +1,8 @@
+# This configuration is for teaching kustomize how to update name ref substitution
+nameReference:
+- kind: Issuer
+  group: cert-manager.io
+  fieldSpecs:
+  - kind: Certificate
+    group: cert-manager.io
+    path: spec/issuerRef/name
diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml
index 127a6ba..86141ab 100644
--- a/config/crd/kustomization.yaml
+++ b/config/crd/kustomization.yaml
@@ -17,5 +17,5 @@ patches:
 
 # [WEBHOOK] To enable webhook, uncomment the following section
 # the following config is for teaching kustomize how to do kustomization for CRDs.
-#configurations:
-#- kustomizeconfig.yaml
+configurations:
+- kustomizeconfig.yaml
diff --git a/config/crd/patches/cainjection_in_pods.yaml b/config/crd/patches/cainjection_in_pods.yaml
new file mode 100644
index 0000000..b1ab830
--- /dev/null
+++ b/config/crd/patches/cainjection_in_pods.yaml
@@ -0,0 +1,7 @@
+# The following patch adds a directive for certmanager to inject CA into the CRD
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  annotations:
+    cert-manager.io/inject-ca-from: CERTIFICATE_NAMESPACE/CERTIFICATE_NAME
+  name: pods.core
diff --git a/config/crd/patches/webhook_in_pods.yaml b/config/crd/patches/webhook_in_pods.yaml
new file mode 100644
index 0000000..8fa5d25
--- /dev/null
+++ b/config/crd/patches/webhook_in_pods.yaml
@@ -0,0 +1,16 @@
+# The following patch enables a conversion webhook for the CRD
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  name: pods.core
+spec:
+  conversion:
+    strategy: Webhook
+    webhook:
+      clientConfig:
+        service:
+          namespace: system
+          name: webhook-service
+          path: /convert
+      conversionReviewVersions:
+      - v1
diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml
index 4f303ca..abb7ff0 100644
--- a/config/default/kustomization.yaml
+++ b/config/default/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
 - ../manager
 # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in
 # crd/kustomization.yaml
-#- ../webhook
+- ../webhook
 # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required.
 #- ../certmanager
 # [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'.
@@ -43,7 +43,7 @@ patches:
 
 # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in
 # crd/kustomization.yaml
-#- path: manager_webhook_patch.yaml
+- path: manager_webhook_patch.yaml
 
 # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER' prefix.
 # Uncomment the following replacements to add the cert-manager CA injection annotations
diff --git a/config/default/manager_webhook_patch.yaml b/config/default/manager_webhook_patch.yaml
new file mode 100644
index 0000000..ad299a5
--- /dev/null
+++ b/config/default/manager_webhook_patch.yaml
@@ -0,0 +1,26 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: controller-manager
+  namespace: system
+  labels:
+    app.kubernetes.io/name: tensor-fusion-operator
+    app.kubernetes.io/managed-by: kustomize
+spec:
+  template:
+    spec:
+      containers:
+      - name: manager
+        ports:
+        - containerPort: 9443
+          name: webhook-server
+          protocol: TCP
+        volumeMounts:
+        - mountPath: /tmp/k8s-webhook-server/serving-certs
+          name: cert
+          readOnly: true
+      volumes:
+      - name: cert
+        secret:
+          defaultMode: 420
+          secretName: webhook-server-cert
diff --git a/config/network-policy/allow-webhook-traffic.yaml b/config/network-policy/allow-webhook-traffic.yaml
new file mode 100644
index 0000000..9076e88
--- /dev/null
+++ b/config/network-policy/allow-webhook-traffic.yaml
@@ -0,0 +1,26 @@
+# This NetworkPolicy allows ingress traffic to your webhook server running
+# as part of the controller-manager from specific namespaces and pods. CR(s) which uses webhooks
+# will only work when applied in namespaces labeled with 'webhook: enabled'
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  labels:
+    app.kubernetes.io/name: tensor-fusion-operator
+    app.kubernetes.io/managed-by: kustomize
+  name: allow-webhook-traffic
+  namespace: system
+spec:
+  podSelector:
+    matchLabels:
+      control-plane: controller-manager
+  policyTypes:
+    - Ingress
+  ingress:
+    # This allows ingress traffic from any namespace with the label webhook: enabled
+    - from:
+      - namespaceSelector:
+          matchLabels:
+            webhook: enabled # Only from namespaces with this label
+      ports:
+        - port: 443
+          protocol: TCP
diff --git a/config/network-policy/kustomization.yaml b/config/network-policy/kustomization.yaml
index ec0fb5e..0872bee 100644
--- a/config/network-policy/kustomization.yaml
+++ b/config/network-policy/kustomization.yaml
@@ -1,2 +1,3 @@
 resources:
+- allow-webhook-traffic.yaml
 - allow-metrics-traffic.yaml
diff --git a/config/webhook/kustomization.yaml b/config/webhook/kustomization.yaml
new file mode 100644
index 0000000..9cf2613
--- /dev/null
+++ b/config/webhook/kustomization.yaml
@@ -0,0 +1,6 @@
+resources:
+- manifests.yaml
+- service.yaml
+
+configurations:
+- kustomizeconfig.yaml
diff --git a/config/webhook/kustomizeconfig.yaml b/config/webhook/kustomizeconfig.yaml
new file mode 100644
index 0000000..206316e
--- /dev/null
+++ b/config/webhook/kustomizeconfig.yaml
@@ -0,0 +1,22 @@
+# the following config is for teaching kustomize where to look at when substituting nameReference.
+# It requires kustomize v2.1.0 or newer to work properly.
+nameReference:
+- kind: Service
+  version: v1
+  fieldSpecs:
+  - kind: MutatingWebhookConfiguration
+    group: admissionregistration.k8s.io
+    path: webhooks/clientConfig/service/name
+  - kind: ValidatingWebhookConfiguration
+    group: admissionregistration.k8s.io
+    path: webhooks/clientConfig/service/name
+
+namespace:
+- kind: MutatingWebhookConfiguration
+  group: admissionregistration.k8s.io
+  path: webhooks/clientConfig/service/namespace
+  create: true
+- kind: ValidatingWebhookConfiguration
+  group: admissionregistration.k8s.io
+  path: webhooks/clientConfig/service/namespace
+  create: true
diff --git a/config/webhook/service.yaml b/config/webhook/service.yaml
new file mode 100644
index 0000000..409f372
--- /dev/null
+++ b/config/webhook/service.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    app.kubernetes.io/name: tensor-fusion-operator
+    app.kubernetes.io/managed-by: kustomize
+  name: webhook-service
+  namespace: system
+spec:
+  ports:
+    - port: 443
+      protocol: TCP
+      targetPort: 9443
+  selector:
+    control-plane: controller-manager
diff --git a/go.mod b/go.mod
index 0b333c1..deab21d 100644
--- a/go.mod
+++ b/go.mod
@@ -7,6 +7,7 @@ require (
 	github.com/gin-gonic/gin v1.10.0
 	github.com/onsi/ginkgo/v2 v2.19.0
 	github.com/onsi/gomega v1.33.1
+	k8s.io/api v0.31.0
 	k8s.io/apimachinery v0.31.0
 	k8s.io/client-go v0.31.0
 	sigs.k8s.io/controller-runtime v0.19.1
@@ -104,7 +105,6 @@ require (
 	gopkg.in/inf.v0 v0.9.1 // indirect
 	gopkg.in/yaml.v2 v2.4.0 // indirect
 	gopkg.in/yaml.v3 v3.0.1 // indirect
-	k8s.io/api v0.31.0 // indirect
 	k8s.io/apiextensions-apiserver v0.31.0 // indirect
 	k8s.io/apiserver v0.31.0 // indirect
 	k8s.io/component-base v0.31.0 // indirect
diff --git a/internal/config/config.go b/internal/config/config.go
new file mode 100644
index 0000000..1146c2b
--- /dev/null
+++ b/internal/config/config.go
@@ -0,0 +1,16 @@
+package config
+
+import corev1 "k8s.io/api/core/v1"
+
+type Config struct {
+	PodMutator PodMutator `json:"podMutator"`
+}
+
+type PodMutator struct {
+	PatchStrategicMerge corev1.Pod      `json:"patchStrategicMerge"`
+	PatchEnvVars        []corev1.EnvVar `json:"envVars"`
+}
+
+func NewDefaultConfig() Config {
+	return Config{}
+}
diff --git a/internal/webhook/v1/pod_webhook.go b/internal/webhook/v1/pod_webhook.go
new file mode 100644
index 0000000..3140b53
--- /dev/null
+++ b/internal/webhook/v1/pod_webhook.go
@@ -0,0 +1,226 @@
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package v1
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/http"
+
+	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+	"github.com/NexusGPU/tensor-fusion-operator/internal/config"
+	"gomodules.xyz/jsonpatch/v2"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/strategicpatch"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
+)
+
+// SetupPodWebhookWithManager registers the Pod mutating webhook. The decoder is
+// built here: recent controller-runtime releases no longer call InjectDecoder.
+func SetupPodWebhookWithManager(mgr ctrl.Manager, config *config.PodMutator) error {
+	mgr.GetWebhookServer().Register("/mutate-v1-pod", &admission.Webhook{
+		Handler: &TensorFusionPodMutator{
+			Config:  config,
+			Client:  mgr.GetClient(),
+			decoder: admission.NewDecoder(mgr.GetScheme()),
+		},
+	})
+	return nil
+}
+
+type TensorFusionPodMutator struct {
+	Client  client.Client
+	Config  *config.PodMutator
+	decoder admission.Decoder
+}
+
+// Handle implements admission.Handler: it patches the pod and creates a TensorFusionConnection per request.
+func (m *TensorFusionPodMutator) Handle(ctx context.Context, req admission.Request) admission.Response {
+	pod := &corev1.Pod{}
+	if err := m.decoder.Decode(req, pod); err != nil {
+		return admission.Errored(http.StatusBadRequest, err)
+	}
+
+	log := log.FromContext(ctx)
+	log.Info("Mutating pod", "name", pod.Name, "namespace", pod.Namespace)
+
+	reqs := parseTFReq(pod)
+	// Apply the configured patch; patchTFClient also rewrites pod in place with the result
+	patches, err := m.patchTFClient(pod, reqs)
+	if err != nil {
+		return admission.Errored(http.StatusInternalServerError, err)
+	}
+
+	// Create one TensorFusionConnection per request. NOTE(review): pod.Name/UID may be unset on CREATE admission — confirm.
+	tfConnections := generateTensorFusionConnection(pod, reqs)
+
+	for _, tfConnection := range tfConnections {
+		if err := m.Client.Create(ctx, tfConnection); err != nil {
+			log.Error(err, "Failed to create TensorFusionConnection")
+			return admission.Errored(http.StatusInternalServerError, err)
+		}
+	}
+
+	return admission.Patched("tensor fusion component patched", patches...)
+}
+
+// InjectDecoder injects the decoder.
+func (m *TensorFusionPodMutator) InjectDecoder(d admission.Decoder) error {
+	m.decoder = d
+	return nil
+}
+
+type TFReq struct {
+	ContainerName string
+	Tflops        resource.Quantity
+	Vram          resource.Quantity
+}
+
+func parseTFReq(pod *corev1.Pod) []TFReq {
+	if pod.Annotations == nil {
+		return nil
+	}
+
+	reqs := make([]TFReq, 0, len(pod.Spec.Containers))
+
+	for _, container := range pod.Spec.Containers {
+		containerName := container.Name
+
+		// Check if tensor fusion is enabled for this container
+		enableKey := fmt.Sprintf("tensor-fusion.ai/enable-%s", containerName)
+		if enableStr, ok := pod.Annotations[enableKey]; !ok || enableStr != "true" {
+			continue
+		}
+
+		req := TFReq{
+			ContainerName: containerName,
+		}
+
+		// Parse TFLOPS requirement
+		tflopsKey := fmt.Sprintf("tensor-fusion.ai/tflops-%s", containerName)
+		if tflopsStr, ok := pod.Annotations[tflopsKey]; ok {
+			tflops, err := resource.ParseQuantity(tflopsStr)
+			if err == nil {
+				req.Tflops = tflops
+			}
+		}
+
+		// Parse VRAM requirement
+		vramKey := fmt.Sprintf("tensor-fusion.ai/vram-%s", containerName)
+		if vramStr, ok := pod.Annotations[vramKey]; ok {
+			vram, err := resource.ParseQuantity(vramStr)
+			if err == nil {
+				req.Vram = vram
+			}
+		}
+
+		reqs = append(reqs, req)
+	}
+
+	return reqs
+}
+
+// patchTFClient applies the configured strategic merge patch to pod and
+// returns the JSON patch operations that turn the original pod into the
+// patched one. pod is updated in place with the patched result.
+func (m *TensorFusionPodMutator) patchTFClient(pod *corev1.Pod, tfReq []TFReq) ([]jsonpatch.JsonPatchOperation, error) {
+	// Work on a copy so the shared config is never mutated by a request.
+	podPatch := m.Config.PatchStrategicMerge
+	podPatch.Spec.Containers = append([]corev1.Container{}, podPatch.Spec.Containers...)
+
+	// Add env vars to patch containers named in tfReq. Index into the slice:
+	// a range value is a copy, so appending to it would be silently dropped.
+	for _, req := range tfReq {
+		for i := range podPatch.Spec.Containers {
+			c := &podPatch.Spec.Containers[i]
+			if c.Name == req.ContainerName {
+				c.Env = append(c.Env[:len(c.Env):len(c.Env)], m.Config.PatchEnvVars...)
+			}
+		}
+	}
+
+	// Marshal the modified copy, not the untouched config.
+	patchBytes, err := json.Marshal(podPatch)
+	if err != nil {
+		return nil, fmt.Errorf("marshal patch: %w", err)
+	}
+	currentBytes, err := json.Marshal(pod)
+	if err != nil {
+		return nil, fmt.Errorf("marshal current pod: %w", err)
+	}
+
+	resultBytes, err := strategicpatch.StrategicMergePatch(currentBytes, patchBytes, corev1.Pod{})
+	if err != nil {
+		return nil, fmt.Errorf("apply strategic merge patch: %w", err)
+	}
+
+	// Diff original against patched pod to get the admission response patch.
+	patches, err := jsonpatch.CreatePatch(currentBytes, resultBytes)
+	if err != nil {
+		return nil, fmt.Errorf("create json patch: %w", err)
+	}
+
+	if err := json.Unmarshal(resultBytes, pod); err != nil {
+		return nil, fmt.Errorf("unmarshal patched pod: %w", err)
+	}
+	return patches, nil
+}
+
+func generateTensorFusionConnection(pod *corev1.Pod, tfReq []TFReq) []*tfv1.TensorFusionConnection {
+	connections := make([]*tfv1.TensorFusionConnection, 0, len(tfReq))
+
+	for _, req := range tfReq {
+		connection := &tfv1.TensorFusionConnection{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      fmt.Sprintf("%s-tf-%s", pod.Name, req.ContainerName),
+				Namespace: pod.Namespace,
+				OwnerReferences: []metav1.OwnerReference{
+					{
+						APIVersion: "v1",
+						Kind:       "Pod",
+						Name:       pod.Name,
+						UID:        pod.UID,
+					},
+				},
+			},
+			Spec: tfv1.TensorFusionConnectionSpec{
+				Resources: tfv1.Resources{
+					Request: tfv1.Resource{
+						Tflops: req.Tflops,
+						Vram:   req.Vram,
+					},
+					Limit: tfv1.Resource{
+						Tflops: req.Tflops,
+						Vram:   req.Vram,
+					},
+				},
+			},
+			Status: tfv1.TensorFusionConnectionStatus{
+				Phase: tfv1.TensorFusionConnectionPending,
+			},
+		}
+		connections = append(connections, connection)
+	}
+
+	return connections
+}
diff --git a/internal/webhook/v1/pod_webhook_test.go b/internal/webhook/v1/pod_webhook_test.go
new file mode 100644
index 0000000..b83a9f1
--- /dev/null
+++ b/internal/webhook/v1/pod_webhook_test.go
@@ -0,0 +1,55 @@
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package v1
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	corev1 "k8s.io/api/core/v1"
+	// TODO (user): Add any additional imports if needed
+)
+
+var _ = Describe("Pod Webhook", func() { // scaffold only: no It specs are defined yet
+	var (
+		obj    *corev1.Pod
+		oldObj *corev1.Pod
+	)
+
+	BeforeEach(func() {
+		obj = &corev1.Pod{}
+		oldObj = &corev1.Pod{}
+		Expect(oldObj).NotTo(BeNil(), "Expected oldObj to be initialized")
+		Expect(obj).NotTo(BeNil(), "Expected obj to be initialized")
+		// TODO (user): Add any setup logic common to all tests
+	})
+
+	AfterEach(func() {
+		// TODO (user): Add any teardown logic common to all tests
+	})
+
+	Context("When creating Pod under Conversion Webhook", func() {
+		// TODO (user): Add logic to convert the object to the desired version and verify the conversion
+		// Example:
+		// It("Should convert the object correctly", func() {
+		//     convertedObj := &corev1.Pod{}
+		//     Expect(obj.ConvertTo(convertedObj)).To(Succeed())
+		//     Expect(convertedObj).ToNot(BeNil())
+		// })
+	})
+
+})
diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go
index aeac7d0..67c63dc 100644
--- a/test/e2e/e2e_test.go
+++ b/test/e2e/e2e_test.go
@@ -234,6 +234,16 @@ var _ = Describe("Manager", Ordered, func() {
 			))
 		})
 
+		It("should provisioned cert-manager", func() {
+			By("validating that cert-manager has the certificate Secret")
+			verifyCertManager := func(g Gomega) {
+				cmd := exec.Command("kubectl", "get", "secrets", "webhook-server-cert", "-n", namespace)
+				_, err := utils.Run(cmd)
+				g.Expect(err).NotTo(HaveOccurred())
+			}
+			Eventually(verifyCertManager).Should(Succeed())
+		})
+
 		// +kubebuilder:scaffold:e2e-webhooks-checks
 
 		// TODO: Customize the e2e test suite with scenarios specific to your project.

From 4a2ec8bf3d862f567754827c2de043ccd6420f57 Mon Sep 17 00:00:00 2001
From: 0x5457 <0x5457@protonmail.com>
Date: Thu, 5 Dec 2024 07:41:27 +0000
Subject: [PATCH 09/22] feat: implement resource cleanup for
 TensorFusionConnection

- Add finalizer handling in TensorFusionConnection controller
- Implement Release method in NaiveScheduler for resource cleanup

This change ensures proper cleanup of GPU resources when a
TensorFusionConnection is deleted, preventing resource leaks.
---
 api/v1/tensorfusionconnection_types.go        |  1 +
 internal/constants/constants.go               | 15 ++++
 .../tensorfusionconnection_controller.go      | 79 ++++++++++++++++++
 internal/scheduler/naive.go                   | 15 ++++
 internal/scheduler/naive_test.go              | 80 +++++++++++++++++++
 internal/scheduler/scheduler.go               |  3 +
 internal/webhook/v1/pod_webhook.go            |  8 +-
 7 files changed, 198 insertions(+), 3 deletions(-)
 create mode 100644 internal/constants/constants.go

diff --git a/api/v1/tensorfusionconnection_types.go b/api/v1/tensorfusionconnection_types.go
index 955f227..00af5f2 100644
--- a/api/v1/tensorfusionconnection_types.go
+++ b/api/v1/tensorfusionconnection_types.go
@@ -49,6 +49,7 @@ type TensorFusionConnectionStatus struct {
 	Phase         TensorFusionConnectionPhase `json:"phase"`
 	ConnectionURL string                      `json:"connectionURL"`
 	QosClass      string                      `json:"qosClass"`
+	Node          string                      `json:"node,omitempty"`
 }
 
 // +kubebuilder:object:root=true
diff --git a/internal/constants/constants.go b/internal/constants/constants.go
new file mode 100644
index 0000000..acf7eee
--- /dev/null
+++ b/internal/constants/constants.go
@@ -0,0 +1,15 @@
+package constants
+
+const (
+	// TensorFusionDomain is the domain prefix used for all tensor-fusion.ai related annotations and finalizers
+	TensorFusionDomain = "tensor-fusion.ai"
+
+	// Finalizer name, composed as "<domain>/<suffix>", i.e. "tensor-fusion.ai/finalizer"
+	TensorFusionFinalizerSuffix = "finalizer"
+	TensorFusionFinalizer       = TensorFusionDomain + "/" + TensorFusionFinalizerSuffix
+
+	// Per-container pod annotation key formats; the %s verb is filled with the container name
+	EnableContainerAnnotationFormat = TensorFusionDomain + "/enable-%s"
+	TFLOPSContainerAnnotationFormat = TensorFusionDomain + "/tflops-%s"
+	VRAMContainerAnnotationFormat   = TensorFusionDomain + "/vram-%s"
+)
diff --git a/internal/controller/tensorfusionconnection_controller.go b/internal/controller/tensorfusionconnection_controller.go
index b9223f4..8e14687 100644
--- a/internal/controller/tensorfusionconnection_controller.go
+++ b/internal/controller/tensorfusionconnection_controller.go
@@ -27,6 +27,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/log"
 
 	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+	"github.com/NexusGPU/tensor-fusion-operator/internal/constants"
 	scheduler "github.com/NexusGPU/tensor-fusion-operator/internal/scheduler"
 	"github.com/NexusGPU/tensor-fusion-operator/internal/worker"
 )
@@ -38,6 +39,10 @@ type TensorFusionConnectionReconciler struct {
 	Scheduler scheduler.Scheduler
 }
 
+var (
+	tensorFusionConnectionFinalizer = constants.TensorFusionFinalizer
+)
+
 // +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=tensorfusionconnections,verbs=get;list;watch;create;update;patch;delete
 // +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=tensorfusionconnections/status,verbs=get;update;patch
 // +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=tensorfusionconnections/finalizers,verbs=update
@@ -58,6 +63,35 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct
 		return ctrl.Result{}, err
 	}
 
+	// Check if the connection is being deleted
+	if !connection.DeletionTimestamp.IsZero() {
+		// The object is being deleted
+		if containsString(connection.Finalizers, tensorFusionConnectionFinalizer) {
+			// Our finalizer is present, so let's handle our external dependency
+			if err := r.handleDeletion(ctx, connection); err != nil {
+				return ctrl.Result{}, err
+			}
+
+			// Remove our finalizer from the list and update it
+			connection.Finalizers = removeString(connection.Finalizers, tensorFusionConnectionFinalizer)
+			if err := r.Update(ctx, connection); err != nil {
+				return ctrl.Result{}, err
+			}
+		}
+		// Our finalizer has finished, so the reconciler can do nothing
+		return ctrl.Result{}, nil
+	}
+
+	// Add finalizer if it's not present
+	if !containsString(connection.Finalizers, tensorFusionConnectionFinalizer) {
+		connection.Finalizers = append(connection.Finalizers, tensorFusionConnectionFinalizer)
+		if err := r.Update(ctx, connection); err != nil {
+			return ctrl.Result{}, err
+		}
+		// Return here as the update will trigger another reconciliation
+		return ctrl.Result{}, nil
+	}
+
 	var node *tfv1.GPUNode
 	// If status is not set or pending, try to schedule
 	if connection.Status.Phase == "" || connection.Status.Phase == tfv1.TensorFusionConnectionPending {
@@ -69,6 +103,7 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct
 		} else if node != nil {
 			connection.Status.Phase = tfv1.TensorFusionConnectionRunning
 			connection.Status.ConnectionURL = worker.GenerateConnectionURL(node, connection)
+			connection.Status.Node = node.Name // Store the node name for cleanup
 		} else {
 			connection.Status.Phase = tfv1.TensorFusionConnectionPending
 		}
@@ -81,6 +116,50 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct
 	return ctrl.Result{}, nil
 }
 
+// handleDeletion releases the scheduler resources of the node named in Status.Node.
+func (r *TensorFusionConnectionReconciler) handleDeletion(ctx context.Context, connection *tfv1.TensorFusionConnection) error {
+	if connection.Status.Node == "" {
+		return nil // No node was allocated, nothing to clean up
+	}
+
+	// Look up the node by name only. NOTE(review): key has no Namespace — confirm GPUNode scope
+	node := &tfv1.GPUNode{}
+	if err := r.Get(ctx, client.ObjectKey{Name: connection.Status.Node}, node); err != nil {
+		if errors.IsNotFound(err) {
+			// Node is already gone, nothing to do
+			return nil
+		}
+		return err
+	}
+
+	// Release the resources; on error the caller returns before removing the finalizer
+	if err := r.Scheduler.Release(node); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// containsString reports whether s is an element of slice (used for finalizer checks).
+func containsString(slice []string, s string) bool {
+	for _, item := range slice {
+		if item == s {
+			return true
+		}
+	}
+	return false
+}
+
+func removeString(slice []string, s string) []string {
+	result := []string{} // fresh, non-nil slice; the input slice is not modified
+	for _, item := range slice {
+		if item != s { // keep every element except those equal to s
+			result = append(result, item)
+		}
+	}
+	return result
+}
+
 func (r *TensorFusionConnectionReconciler) MustUpdateStatus(ctx context.Context, connection *tfv1.TensorFusionConnection, gpuNode *tfv1.GPUNode) error {
 	return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
 		// Get the latest version of the connection
diff --git a/internal/scheduler/naive.go b/internal/scheduler/naive.go
index 8d9a4d3..0192173 100644
--- a/internal/scheduler/naive.go
+++ b/internal/scheduler/naive.go
@@ -58,3 +58,18 @@ func (s *NaiveScheduler) OnDelete(node *tfv1.GPUNode) {
 	defer s.Unlock()
 	delete(s.nodes, node.Name)
 }
+
+// Release implements Scheduler: resets the tracked node's Available to Capacity; errors if untracked.
+func (s *NaiveScheduler) Release(node *tfv1.GPUNode) error {
+	s.Lock()
+	defer s.Unlock()
+
+	existingNode, ok := s.nodes[node.Name]
+	if !ok {
+		return fmt.Errorf("node %s not found", node.Name)
+	}
+
+	// Reset Available to full Capacity. NOTE(review): this drops every allocation on the node
+	existingNode.Status.Available = existingNode.Status.Capacity
+	return nil
+}
diff --git a/internal/scheduler/naive_test.go b/internal/scheduler/naive_test.go
index be1affd..f6367a2 100644
--- a/internal/scheduler/naive_test.go
+++ b/internal/scheduler/naive_test.go
@@ -164,3 +164,83 @@ func TestNaiveScheduler_NodeOperations(t *testing.T) {
 		t.Error("After OnDelete: Schedule() should fail with no nodes")
 	}
 }
+
+func TestNaiveScheduler_Release(t *testing.T) {
+	tests := []struct {
+		name      string
+		node      *tfv1.GPUNode
+		schedule  *tfv1.Resource
+		wantError bool
+	}{
+		{
+			name:      "release non-existent node",
+			node:      createGPUNode("node1", "100", "16Gi"),
+			wantError: true,
+		},
+		{
+			name: "release after scheduling",
+			node: &tfv1.GPUNode{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node1",
+				},
+				Status: tfv1.GPUNodeStatus{
+					Capacity: tfv1.Resource{
+						Tflops: resource.MustParse("100"),
+						Vram:   resource.MustParse("16Gi"),
+					},
+					Available: tfv1.Resource{
+						Tflops: resource.MustParse("100"),
+						Vram:   resource.MustParse("16Gi"),
+					},
+				},
+			},
+			schedule: &tfv1.Resource{
+				Tflops: resource.MustParse("50"),
+				Vram:   resource.MustParse("8Gi"),
+			},
+			wantError: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			s := NewNaiveScheduler()
+
+			if !tt.wantError {
+				// Register the node; the error case skips this so Release sees an unknown node
+				s.OnAdd(tt.node)
+
+				// Schedule some resources if needed
+				if tt.schedule != nil {
+					node, err := s.Schedule(*tt.schedule)
+					if err != nil {
+						t.Errorf("Schedule() error = %v", err)
+						return
+					}
+
+					// Verify the remaining Available is 50 TFLOPS / 8Gi after scheduling
+					if node.Status.Available.Tflops.Cmp(resource.MustParse("50")) != 0 ||
+						node.Status.Available.Vram.Cmp(resource.MustParse("8Gi")) != 0 {
+						t.Errorf("Schedule() did not allocate resources correctly")
+						return
+					}
+				}
+			}
+
+			err := s.Release(tt.node)
+			if (err != nil) != tt.wantError {
+				t.Errorf("Release() error = %v, wantError %v", err, tt.wantError)
+				return
+			}
+
+			if !tt.wantError {
+				// Verify Available equals Capacity again on the scheduler's stored node
+				node := s.nodes[tt.node.Name]
+				if node.Status.Available.Tflops.Cmp(node.Status.Capacity.Tflops) != 0 ||
+					node.Status.Available.Vram.Cmp(node.Status.Capacity.Vram) != 0 {
+					t.Errorf("Release() did not restore resources correctly")
+				}
+			}
+		})
+	}
+}
diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go
index 47ac776..163124c 100644
--- a/internal/scheduler/scheduler.go
+++ b/internal/scheduler/scheduler.go
@@ -11,6 +11,9 @@ type Scheduler interface {
 	// an nil pointer and an error.
 	Schedule(request tfv1.Resource) (*tfv1.GPUNode, error)
 
+	// Release frees the allocated resources of a node
+	Release(node *tfv1.GPUNode) error
+
 	// OnAdd is called when a new node is added
 	OnAdd(node *tfv1.GPUNode)
 	// OnUpdate is called when a node is modified
diff --git a/internal/webhook/v1/pod_webhook.go b/internal/webhook/v1/pod_webhook.go
index 3140b53..8edcb70 100644
--- a/internal/webhook/v1/pod_webhook.go
+++ b/internal/webhook/v1/pod_webhook.go
@@ -33,6 +33,8 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
+
+	"github.com/NexusGPU/tensor-fusion-operator/internal/constants"
 )
 
 // SetupPodWebhookWithManager registers the webhook for Pod in the manager.
@@ -107,7 +109,7 @@ func parseTFReq(pod *corev1.Pod) []TFReq {
 		containerName := container.Name
 
 		// Check if tensor fusion is enabled for this container
-		enableKey := fmt.Sprintf("tensor-fusion.ai/enable-%s", containerName)
+		enableKey := fmt.Sprintf(constants.EnableContainerAnnotationFormat, containerName)
 		if enableStr, ok := pod.Annotations[enableKey]; !ok || enableStr != "true" {
 			continue
 		}
@@ -117,7 +119,7 @@ func parseTFReq(pod *corev1.Pod) []TFReq {
 		}
 
 		// Parse TFLOPS requirement
-		tflopsKey := fmt.Sprintf("tensor-fusion.ai/tflops-%s", containerName)
+		tflopsKey := fmt.Sprintf(constants.TFLOPSContainerAnnotationFormat, containerName)
 		if tflopsStr, ok := pod.Annotations[tflopsKey]; ok {
 			tflops, err := resource.ParseQuantity(tflopsStr)
 			if err == nil {
@@ -126,7 +128,7 @@ func parseTFReq(pod *corev1.Pod) []TFReq {
 		}
 
 		// Parse VRAM requirement
-		vramKey := fmt.Sprintf("tensor-fusion.ai/vram-%s", containerName)
+		vramKey := fmt.Sprintf(constants.VRAMContainerAnnotationFormat, containerName)
 		if vramStr, ok := pod.Annotations[vramKey]; ok {
 			vram, err := resource.ParseQuantity(vramStr)
 			if err == nil {

From 7a24ba3112cb01d13a07a919d420b240ed95b92b Mon Sep 17 00:00:00 2001
From: 0x5457 <0x5457@protonmail.com>
Date: Thu, 5 Dec 2024 07:56:15 +0000
Subject: [PATCH 10/22] fix lint

---
 internal/scheduler/naive_test.go |  1 +
 test/e2e/e2e_test.go             | 12 ++++++------
 test/utils/utils.go              |  4 ++--
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/internal/scheduler/naive_test.go b/internal/scheduler/naive_test.go
index f6367a2..823f8c4 100644
--- a/internal/scheduler/naive_test.go
+++ b/internal/scheduler/naive_test.go
@@ -22,6 +22,7 @@ func createGPUNode(name string, tflops, vram string) *tfv1.GPUNode {
 	}
 }
 
+//nolint:unparam
 func createRequest(tflops, vram string) tfv1.Resource {
 	return tfv1.Resource{
 		Tflops: resource.MustParse(tflops),
diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go
index 67c63dc..f5ff7c5 100644
--- a/test/e2e/e2e_test.go
+++ b/test/e2e/e2e_test.go
@@ -93,27 +93,27 @@ var _ = Describe("Manager", Ordered, func() {
 			cmd := exec.Command("kubectl", "logs", controllerPodName, "-n", namespace)
 			controllerLogs, err := utils.Run(cmd)
 			if err == nil {
-				_, _ = fmt.Fprintf(GinkgoWriter, fmt.Sprintf("Controller logs:\n %s", controllerLogs))
+				_, _ = fmt.Fprintf(GinkgoWriter, "Controller logs:\n %s", controllerLogs)
 			} else {
-				_, _ = fmt.Fprintf(GinkgoWriter, fmt.Sprintf("Failed to get Controller logs: %s", err))
+				_, _ = fmt.Fprintf(GinkgoWriter, "Failed to get Controller logs: %s", err)
 			}
 
 			By("Fetching Kubernetes events")
 			cmd = exec.Command("kubectl", "get", "events", "-n", namespace, "--sort-by=.lastTimestamp")
 			eventsOutput, err := utils.Run(cmd)
 			if err == nil {
-				_, _ = fmt.Fprintf(GinkgoWriter, fmt.Sprintf("Kubernetes events:\n%s", eventsOutput))
+				_, _ = fmt.Fprintf(GinkgoWriter, "Kubernetes events:\n%s", eventsOutput)
 			} else {
-				_, _ = fmt.Fprintf(GinkgoWriter, fmt.Sprintf("Failed to get Kubernetes events: %s", err))
+				_, _ = fmt.Fprintf(GinkgoWriter, "Failed to get Kubernetes events: %s", err)
 			}
 
 			By("Fetching curl-metrics logs")
 			cmd = exec.Command("kubectl", "logs", "curl-metrics", "-n", namespace)
 			metricsOutput, err := utils.Run(cmd)
 			if err == nil {
-				_, _ = fmt.Fprintf(GinkgoWriter, fmt.Sprintf("Metrics logs:\n %s", metricsOutput))
+				_, _ = fmt.Fprintf(GinkgoWriter, "Metrics logs:\n %s", metricsOutput)
 			} else {
-				_, _ = fmt.Fprintf(GinkgoWriter, fmt.Sprintf("Failed to get curl-metrics logs: %s", err))
+				_, _ = fmt.Fprintf(GinkgoWriter, "Failed to get curl-metrics logs: %s", err)
 			}
 
 			By("Fetching controller manager pod description")
diff --git a/test/utils/utils.go b/test/utils/utils.go
index c3d51ce..8319bc4 100644
--- a/test/utils/utils.go
+++ b/test/utils/utils.go
@@ -92,7 +92,7 @@ func IsPrometheusCRDsInstalled() bool {
 	if err != nil {
 		return false
 	}
-	crdList := GetNonEmptyLines(string(output))
+	crdList := GetNonEmptyLines(output)
 	for _, crd := range prometheusCRDs {
 		for _, line := range crdList {
 			if strings.Contains(line, crd) {
@@ -153,7 +153,7 @@ func IsCertManagerCRDsInstalled() bool {
 	}
 
 	// Check if any of the Cert Manager CRDs are present
-	crdList := GetNonEmptyLines(string(output))
+	crdList := GetNonEmptyLines(output)
 	for _, crd := range certManagerCRDs {
 		for _, line := range crdList {
 			if strings.Contains(line, crd) {

From a6f7ca835f4b89ab7b1b4ba57892e7cac4528927 Mon Sep 17 00:00:00 2001
From: 0x5457 <0x5457@protonmail.com>
Date: Thu, 5 Dec 2024 16:42:13 +0000
Subject: [PATCH 11/22] chore: init naive scheduler

---
 cmd/main.go                                           | 10 +++++++---
 ...n.ai.tensor-fusion.ai_tensorfusionconnections.yaml |  2 ++
 internal/controller/gpunode_controller.go             | 11 -----------
 3 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/cmd/main.go b/cmd/main.go
index 7f7bf2b..3ed420c 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -40,6 +40,7 @@ import (
 	tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
 	"github.com/NexusGPU/tensor-fusion-operator/internal/config"
 	"github.com/NexusGPU/tensor-fusion-operator/internal/controller"
+	"github.com/NexusGPU/tensor-fusion-operator/internal/scheduler"
 	"github.com/NexusGPU/tensor-fusion-operator/internal/server"
 	"github.com/NexusGPU/tensor-fusion-operator/internal/server/router"
 	webhookcorev1 "github.com/NexusGPU/tensor-fusion-operator/internal/webhook/v1"
@@ -158,9 +159,12 @@ func main() {
 		setupLog.Error(err, "unable to create controller", "controller", "TensorFusionConnection")
 		os.Exit(1)
 	}
+
+	scheduler := scheduler.NewNaiveScheduler()
 	if err = (&controller.GPUNodeReconciler{
-		Client: mgr.GetClient(),
-		Scheme: mgr.GetScheme(),
+		Client:    mgr.GetClient(),
+		Scheme:    mgr.GetScheme(),
+		Scheduler: scheduler,
 	}).SetupWithManager(ctx, mgr); err != nil {
 		setupLog.Error(err, "unable to create controller", "controller", "GPUNode")
 		os.Exit(1)
@@ -173,8 +177,8 @@ func main() {
 			os.Exit(1)
 		}
 	}
-	// +kubebuilder:scaffold:builder
 
+	// +kubebuilder:scaffold:builder
 	if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
 		setupLog.Error(err, "unable to set up health check")
 		os.Exit(1)
diff --git a/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml b/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml
index 135776a..c9d1ec9 100644
--- a/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml
+++ b/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml
@@ -91,6 +91,8 @@ spec:
             properties:
               connectionURL:
                 type: string
+              node:
+                type: string
               phase:
                 type: string
               qosClass:
diff --git a/internal/controller/gpunode_controller.go b/internal/controller/gpunode_controller.go
index bab8ff5..2c1e2a9 100644
--- a/internal/controller/gpunode_controller.go
+++ b/internal/controller/gpunode_controller.go
@@ -49,17 +49,6 @@ func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
 
 // SetupWithManager sets up the controller with the Manager.
 func (r *GPUNodeReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
-	// List all existing GPUNodes and add them to scheduler
-	existingNodes := &tfv1.GPUNodeList{}
-	if err := r.List(ctx, existingNodes); err != nil {
-		return err
-	}
-
-	// Add all existing nodes to scheduler
-	for i := range existingNodes.Items {
-		r.Scheduler.OnAdd(&existingNodes.Items[i])
-	}
-
 	return ctrl.NewControllerManagedBy(mgr).
 		For(&tfv1.GPUNode{}).
 		Named("gpunode").

From 9155aabf755d223fd8fdb0d427d8a46ab5ea72e7 Mon Sep 17 00:00:00 2001
From: 0x5457 <0x5457@protonmail.com>
Date: Fri, 6 Dec 2024 01:08:39 +0800
Subject: [PATCH 12/22] chore: rename group

---
 PROJECT                                       |   2 -
 README.md                                     | 114 +++++++++++++++++-
 api/v1/groupversion_info.go                   |   4 +-
 ...es.yaml => tensor-fusion.ai_gpunodes.yaml} |   4 +-
 ...or-fusion.ai_tensorfusionconnections.yaml} |   4 +-
 config/crd/kustomization.yaml                 |   4 +-
 config/rbac/gpunode_editor_role.yaml          |   4 +-
 config/rbac/gpunode_viewer_role.yaml          |   4 +-
 config/rbac/role.yaml                         |   6 +-
 .../tensorfusionconnection_editor_role.yaml   |   4 +-
 .../tensorfusionconnection_viewer_role.yaml   |   4 +-
 config/samples/kustomization.yaml             |   4 +-
 ...ion.ai_v1_gpunode.yaml => v1_gpunode.yaml} |   2 +-
 ...on.yaml => v1_tensorfusionconnection.yaml} |   2 +-
 internal/controller/gpunode_controller.go     |   6 +-
 .../tensorfusionconnection_controller.go      |   6 +-
 16 files changed, 142 insertions(+), 32 deletions(-)
 rename config/crd/bases/{tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml => tensor-fusion.ai_gpunodes.yaml} (97%)
 rename config/crd/bases/{tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml => tensor-fusion.ai_tensorfusionconnections.yaml} (97%)
 rename config/samples/{tensor-fusion.ai_v1_gpunode.yaml => v1_gpunode.yaml} (79%)
 rename config/samples/{tensor-fusion.ai_v1_tensorfusionconnection.yaml => v1_tensorfusionconnection.yaml} (81%)

diff --git a/PROJECT b/PROJECT
index cde7c4f..6730b7f 100644
--- a/PROJECT
+++ b/PROJECT
@@ -13,7 +13,6 @@ resources:
     namespaced: true
   controller: true
   domain: tensor-fusion.ai
-  group: tensor-fusion.ai
   kind: TensorFusionConnection
   path: github.com/NexusGPU/tensor-fusion-operator/api/v1
   version: v1
@@ -22,7 +21,6 @@ resources:
     namespaced: true
   controller: true
   domain: tensor-fusion.ai
-  group: tensor-fusion.ai
   kind: GPUNode
   path: github.com/NexusGPU/tensor-fusion-operator/api/v1
   version: v1
diff --git a/README.md b/README.md
index f1370f4..f2daa2f 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,114 @@
 # tensor-fusion-operator
-Tensor Fusion operator including custom resources, admission webhooks, metrics aggregators, cluster management APIs, cloud integration etc.
+// TODO(user): Add simple overview of use/purpose
+
+## Description
+// TODO(user): An in-depth paragraph about your project and overview of use
+
+## Getting Started
+
+### Prerequisites
+- go version v1.22.0+
+- docker version 17.03+.
+- kubectl version v1.11.3+.
+- Access to a Kubernetes v1.11.3+ cluster.
+
+### To Deploy on the cluster
+**Build and push your image to the location specified by `IMG`:**
+
+```sh
+make docker-build docker-push IMG=<some-registry>/tensor-fusion-operator:tag
+```
+
+**NOTE:** This image ought to be published in the personal registry you specified.
+And it is required to have access to pull the image from the working environment.
+Make sure you have the proper permission to the registry if the above commands don’t work.
+
+**Install the CRDs into the cluster:**
+
+```sh
+make install
+```
+
+**Deploy the Manager to the cluster with the image specified by `IMG`:**
+
+```sh
+make deploy IMG=<some-registry>/tensor-fusion-operator:tag
+```
+
+> **NOTE**: If you encounter RBAC errors, you may need to grant yourself cluster-admin
+privileges or be logged in as admin.
+
+**Create instances of your solution**
+You can apply the samples (examples) from the config/samples directory:
+
+```sh
+kubectl apply -k config/samples/
+```
+
+>**NOTE**: Ensure that the samples have default values to test it out.
+
+### To Uninstall
+**Delete the instances (CRs) from the cluster:**
+
+```sh
+kubectl delete -k config/samples/
+```
+
+**Delete the APIs(CRDs) from the cluster:**
+
+```sh
+make uninstall
+```
+
+**UnDeploy the controller from the cluster:**
+
+```sh
+make undeploy
+```
+
+## Project Distribution
+
+Following are the steps to build the installer and distribute this project to users.
+
+1. Build the installer for the image built and published in the registry:
+
+```sh
+make build-installer IMG=<some-registry>/tensor-fusion-operator:tag
+```
+
+NOTE: The makefile target mentioned above generates an 'install.yaml'
+file in the dist directory. This file contains all the resources built
+with Kustomize, which are necessary to install this project without
+its dependencies.
+
+2. Using the installer
+
+Users can just run kubectl apply -f <URL for YAML BUNDLE> to install the project, i.e.:
+
+```sh
+kubectl apply -f https://raw.githubusercontent.com/<org>/tensor-fusion-operator/<tag or branch>/dist/install.yaml
+```
+
+## Contributing
+// TODO(user): Add detailed information on how you would like others to contribute to this project
+
+**NOTE:** Run `make help` for more information on all potential `make` targets
+
+More information can be found via the [Kubebuilder Documentation](https://book.kubebuilder.io/introduction.html)
+
+## License
+
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
diff --git a/api/v1/groupversion_info.go b/api/v1/groupversion_info.go
index 9172ec6..72aaccc 100644
--- a/api/v1/groupversion_info.go
+++ b/api/v1/groupversion_info.go
@@ -16,7 +16,7 @@ limitations under the License.
 
 // Package v1 contains API Schema definitions for the tensor-fusion.ai v1 API group.
 // +kubebuilder:object:generate=true
-// +groupName=tensor-fusion.ai.tensor-fusion.ai
+// +groupName=tensor-fusion.ai
 package v1
 
 import (
@@ -26,7 +26,7 @@ import (
 
 var (
 	// GroupVersion is group version used to register these objects.
-	GroupVersion = schema.GroupVersion{Group: "tensor-fusion.ai.tensor-fusion.ai", Version: "v1"}
+	GroupVersion = schema.GroupVersion{Group: "tensor-fusion.ai", Version: "v1"}
 
 	// SchemeBuilder is used to add go types to the GroupVersionKind scheme.
 	SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion}
diff --git a/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml b/config/crd/bases/tensor-fusion.ai_gpunodes.yaml
similarity index 97%
rename from config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml
rename to config/crd/bases/tensor-fusion.ai_gpunodes.yaml
index 4829b1f..34442aa 100644
--- a/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml
+++ b/config/crd/bases/tensor-fusion.ai_gpunodes.yaml
@@ -4,9 +4,9 @@ kind: CustomResourceDefinition
 metadata:
   annotations:
     controller-gen.kubebuilder.io/version: v0.16.4
-  name: gpunodes.tensor-fusion.ai.tensor-fusion.ai
+  name: gpunodes.tensor-fusion.ai
 spec:
-  group: tensor-fusion.ai.tensor-fusion.ai
+  group: tensor-fusion.ai
   names:
     kind: GPUNode
     listKind: GPUNodeList
diff --git a/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml b/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml
similarity index 97%
rename from config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml
rename to config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml
index c9d1ec9..9fb2714 100644
--- a/config/crd/bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml
+++ b/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml
@@ -4,9 +4,9 @@ kind: CustomResourceDefinition
 metadata:
   annotations:
     controller-gen.kubebuilder.io/version: v0.16.4
-  name: tensorfusionconnections.tensor-fusion.ai.tensor-fusion.ai
+  name: tensorfusionconnections.tensor-fusion.ai
 spec:
-  group: tensor-fusion.ai.tensor-fusion.ai
+  group: tensor-fusion.ai
   names:
     kind: TensorFusionConnection
     listKind: TensorFusionConnectionList
diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml
index 86141ab..ef965fc 100644
--- a/config/crd/kustomization.yaml
+++ b/config/crd/kustomization.yaml
@@ -2,8 +2,8 @@
 # since it depends on service name and namespace that are out of this kustomize package.
 # It should be run by config/default
 resources:
-- bases/tensor-fusion.ai.tensor-fusion.ai_tensorfusionconnections.yaml
-- bases/tensor-fusion.ai.tensor-fusion.ai_gpunodes.yaml
+- bases/tensor-fusion.ai_tensorfusionconnections.yaml
+- bases/tensor-fusion.ai_gpunodes.yaml
 # +kubebuilder:scaffold:crdkustomizeresource
 
 patches:
diff --git a/config/rbac/gpunode_editor_role.yaml b/config/rbac/gpunode_editor_role.yaml
index 11c1526..10e6ec1 100644
--- a/config/rbac/gpunode_editor_role.yaml
+++ b/config/rbac/gpunode_editor_role.yaml
@@ -8,7 +8,7 @@ metadata:
   name: gpunode-editor-role
 rules:
 - apiGroups:
-  - tensor-fusion.ai.tensor-fusion.ai
+  - tensor-fusion.ai
   resources:
   - gpunodes
   verbs:
@@ -20,7 +20,7 @@ rules:
   - update
   - watch
 - apiGroups:
-  - tensor-fusion.ai.tensor-fusion.ai
+  - tensor-fusion.ai
   resources:
   - gpunodes/status
   verbs:
diff --git a/config/rbac/gpunode_viewer_role.yaml b/config/rbac/gpunode_viewer_role.yaml
index a4808a0..376b12f 100644
--- a/config/rbac/gpunode_viewer_role.yaml
+++ b/config/rbac/gpunode_viewer_role.yaml
@@ -8,7 +8,7 @@ metadata:
   name: gpunode-viewer-role
 rules:
 - apiGroups:
-  - tensor-fusion.ai.tensor-fusion.ai
+  - tensor-fusion.ai
   resources:
   - gpunodes
   verbs:
@@ -16,7 +16,7 @@ rules:
   - list
   - watch
 - apiGroups:
-  - tensor-fusion.ai.tensor-fusion.ai
+  - tensor-fusion.ai
   resources:
   - gpunodes/status
   verbs:
diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml
index b5d3369..a2a838e 100644
--- a/config/rbac/role.yaml
+++ b/config/rbac/role.yaml
@@ -5,7 +5,7 @@ metadata:
   name: manager-role
 rules:
 - apiGroups:
-  - tensor-fusion.ai.tensor-fusion.ai
+  - tensor-fusion.ai
   resources:
   - gpunodes
   - tensorfusionconnections
@@ -18,14 +18,14 @@ rules:
   - update
   - watch
 - apiGroups:
-  - tensor-fusion.ai.tensor-fusion.ai
+  - tensor-fusion.ai
   resources:
   - gpunodes/finalizers
   - tensorfusionconnections/finalizers
   verbs:
   - update
 - apiGroups:
-  - tensor-fusion.ai.tensor-fusion.ai
+  - tensor-fusion.ai
   resources:
   - gpunodes/status
   - tensorfusionconnections/status
diff --git a/config/rbac/tensorfusionconnection_editor_role.yaml b/config/rbac/tensorfusionconnection_editor_role.yaml
index d7627ed..dd1c5ff 100644
--- a/config/rbac/tensorfusionconnection_editor_role.yaml
+++ b/config/rbac/tensorfusionconnection_editor_role.yaml
@@ -8,7 +8,7 @@ metadata:
   name: tensorfusionconnection-editor-role
 rules:
 - apiGroups:
-  - tensor-fusion.ai.tensor-fusion.ai
+  - tensor-fusion.ai
   resources:
   - tensorfusionconnections
   verbs:
@@ -20,7 +20,7 @@ rules:
   - update
   - watch
 - apiGroups:
-  - tensor-fusion.ai.tensor-fusion.ai
+  - tensor-fusion.ai
   resources:
   - tensorfusionconnections/status
   verbs:
diff --git a/config/rbac/tensorfusionconnection_viewer_role.yaml b/config/rbac/tensorfusionconnection_viewer_role.yaml
index 498b61e..e93e3c0 100644
--- a/config/rbac/tensorfusionconnection_viewer_role.yaml
+++ b/config/rbac/tensorfusionconnection_viewer_role.yaml
@@ -8,7 +8,7 @@ metadata:
   name: tensorfusionconnection-viewer-role
 rules:
 - apiGroups:
-  - tensor-fusion.ai.tensor-fusion.ai
+  - tensor-fusion.ai
   resources:
   - tensorfusionconnections
   verbs:
@@ -16,7 +16,7 @@ rules:
   - list
   - watch
 - apiGroups:
-  - tensor-fusion.ai.tensor-fusion.ai
+  - tensor-fusion.ai
   resources:
   - tensorfusionconnections/status
   verbs:
diff --git a/config/samples/kustomization.yaml b/config/samples/kustomization.yaml
index 022e343..387e950 100644
--- a/config/samples/kustomization.yaml
+++ b/config/samples/kustomization.yaml
@@ -1,5 +1,5 @@
 ## Append samples of your project ##
 resources:
-- tensor-fusion.ai_v1_tensorfusionconnection.yaml
-- tensor-fusion.ai_v1_gpunode.yaml
+- v1_tensorfusionconnection.yaml
+- v1_gpunode.yaml
 # +kubebuilder:scaffold:manifestskustomizesamples
diff --git a/config/samples/tensor-fusion.ai_v1_gpunode.yaml b/config/samples/v1_gpunode.yaml
similarity index 79%
rename from config/samples/tensor-fusion.ai_v1_gpunode.yaml
rename to config/samples/v1_gpunode.yaml
index 0957bdb..0a5d491 100644
--- a/config/samples/tensor-fusion.ai_v1_gpunode.yaml
+++ b/config/samples/v1_gpunode.yaml
@@ -1,4 +1,4 @@
-apiVersion: tensor-fusion.ai.tensor-fusion.ai/v1
+apiVersion: tensor-fusion.ai/v1
 kind: GPUNode
 metadata:
   labels:
diff --git a/config/samples/tensor-fusion.ai_v1_tensorfusionconnection.yaml b/config/samples/v1_tensorfusionconnection.yaml
similarity index 81%
rename from config/samples/tensor-fusion.ai_v1_tensorfusionconnection.yaml
rename to config/samples/v1_tensorfusionconnection.yaml
index 91c2a95..3eb2690 100644
--- a/config/samples/tensor-fusion.ai_v1_tensorfusionconnection.yaml
+++ b/config/samples/v1_tensorfusionconnection.yaml
@@ -1,4 +1,4 @@
-apiVersion: tensor-fusion.ai.tensor-fusion.ai/v1
+apiVersion: tensor-fusion.ai/v1
 kind: TensorFusionConnection
 metadata:
   labels:
diff --git a/internal/controller/gpunode_controller.go b/internal/controller/gpunode_controller.go
index 2c1e2a9..caea975 100644
--- a/internal/controller/gpunode_controller.go
+++ b/internal/controller/gpunode_controller.go
@@ -36,9 +36,9 @@ type GPUNodeReconciler struct {
 	Scheduler scheduler.Scheduler
 }
 
-// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=gpunodes,verbs=get;list;watch;create;update;patch;delete
-// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=gpunodes/status,verbs=get;update;patch
-// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=gpunodes/finalizers,verbs=update
+// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes/status,verbs=get;update;patch
+// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes/finalizers,verbs=update
 
 // Reconcile is part of the main kubernetes reconciliation loop which aims to
 // move the current state of the cluster closer to the desired state.
diff --git a/internal/controller/tensorfusionconnection_controller.go b/internal/controller/tensorfusionconnection_controller.go
index 8e14687..d546459 100644
--- a/internal/controller/tensorfusionconnection_controller.go
+++ b/internal/controller/tensorfusionconnection_controller.go
@@ -43,9 +43,9 @@ var (
 	tensorFusionConnectionFinalizer = constants.TensorFusionFinalizer
 )
 
-// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=tensorfusionconnections,verbs=get;list;watch;create;update;patch;delete
-// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=tensorfusionconnections/status,verbs=get;update;patch
-// +kubebuilder:rbac:groups=tensor-fusion.ai.tensor-fusion.ai,resources=tensorfusionconnections/finalizers,verbs=update
+// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=tensorfusionconnections,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=tensorfusionconnections/status,verbs=get;update;patch
+// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=tensorfusionconnections/finalizers,verbs=update
 
 // Reconcile is part of the main kubernetes reconciliation loop which aims to
 // move the current state of the cluster closer to the desired state.

From e63deee4d67541f6e8d8218885bc1fde5421ca02 Mon Sep 17 00:00:00 2001
From: 0x5457 <0x5457@protonmail.com>
Date: Fri, 6 Dec 2024 05:38:11 +0000
Subject: [PATCH 13/22] feat(webhook): implement pod mutating webhook and test
 infrastructure

- Add webhook manifests for pod mutation
- Remove conversion webhook config from PROJECT file
- Implement webhook test suite with ginkgo framework
- Update pod webhook tests to focus on defaulting instead of conversion
- Add CA injection verification in e2e tests
---
 PROJECT                                   |   1 -
 config/webhook/manifests.yaml             |  26 ++++
 internal/webhook/v1/pod_webhook_test.go   |  22 ++--
 internal/webhook/v1/webhook_suite_test.go | 149 ++++++++++++++++++++++
 test/e2e/e2e_test.go                      |  14 ++
 5 files changed, 203 insertions(+), 9 deletions(-)
 create mode 100644 config/webhook/manifests.yaml
 create mode 100644 internal/webhook/v1/webhook_suite_test.go

diff --git a/PROJECT b/PROJECT
index 6730b7f..cc6a157 100644
--- a/PROJECT
+++ b/PROJECT
@@ -30,6 +30,5 @@ resources:
   path: k8s.io/api/core/v1
   version: v1
   webhooks:
-    conversion: true
     webhookVersion: v1
 version: "3"
diff --git a/config/webhook/manifests.yaml b/config/webhook/manifests.yaml
new file mode 100644
index 0000000..ec70061
--- /dev/null
+++ b/config/webhook/manifests.yaml
@@ -0,0 +1,26 @@
+---
+apiVersion: admissionregistration.k8s.io/v1
+kind: MutatingWebhookConfiguration
+metadata:
+  name: mutating-webhook-configuration
+webhooks:
+- admissionReviewVersions:
+  - v1
+  clientConfig:
+    service:
+      name: webhook-service
+      namespace: system
+      path: /mutate--v1-pod
+  failurePolicy: Fail
+  name: mpod-v1.kb.io
+  rules:
+  - apiGroups:
+    - ""
+    apiVersions:
+    - v1
+    operations:
+    - CREATE
+    - UPDATE
+    resources:
+    - pods
+  sideEffects: None
diff --git a/internal/webhook/v1/pod_webhook_test.go b/internal/webhook/v1/pod_webhook_test.go
index b83a9f1..1fb7a60 100644
--- a/internal/webhook/v1/pod_webhook_test.go
+++ b/internal/webhook/v1/pod_webhook_test.go
@@ -26,13 +26,16 @@ import (
 
 var _ = Describe("Pod Webhook", func() {
 	var (
-		obj    *corev1.Pod
-		oldObj *corev1.Pod
+		obj       *corev1.Pod
+		oldObj    *corev1.Pod
+		defaulter PodCustomDefaulter
 	)
 
 	BeforeEach(func() {
 		obj = &corev1.Pod{}
 		oldObj = &corev1.Pod{}
+		defaulter = PodCustomDefaulter{}
+		Expect(defaulter).NotTo(BeNil(), "Expected defaulter to be initialized")
 		Expect(oldObj).NotTo(BeNil(), "Expected oldObj to be initialized")
 		Expect(obj).NotTo(BeNil(), "Expected obj to be initialized")
 		// TODO (user): Add any setup logic common to all tests
@@ -42,13 +45,16 @@ var _ = Describe("Pod Webhook", func() {
 		// TODO (user): Add any teardown logic common to all tests
 	})
 
-	Context("When creating Pod under Conversion Webhook", func() {
-		// TODO (user): Add logic to convert the object to the desired version and verify the conversion
+	Context("When creating Pod under Defaulting Webhook", func() {
+		// TODO (user): Add logic for defaulting webhooks
 		// Example:
-		// It("Should convert the object correctly", func() {
-		//     convertedObj := &corev1.Pod{}
-		//     Expect(obj.ConvertTo(convertedObj)).To(Succeed())
-		//     Expect(convertedObj).ToNot(BeNil())
+		// It("Should apply defaults when a required field is empty", func() {
+		//     By("simulating a scenario where defaults should be applied")
+		//     obj.SomeFieldWithDefault = ""
+		//     By("calling the Default method to apply defaults")
+		//     defaulter.Default(ctx, obj)
+		//     By("checking that the default values are set")
+		//     Expect(obj.SomeFieldWithDefault).To(Equal("default_value"))
 		// })
 	})
 
diff --git a/internal/webhook/v1/webhook_suite_test.go b/internal/webhook/v1/webhook_suite_test.go
new file mode 100644
index 0000000..2ab08ea
--- /dev/null
+++ b/internal/webhook/v1/webhook_suite_test.go
@@ -0,0 +1,149 @@
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package v1
+
+import (
+	"context"
+	"crypto/tls"
+	"fmt"
+	"net"
+	"path/filepath"
+	"runtime"
+	"testing"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	admissionv1 "k8s.io/api/admission/v1"
+	corev1 "k8s.io/api/core/v1"
+
+	// +kubebuilder:scaffold:imports
+	apimachineryruntime "k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/client-go/rest"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/envtest"
+	logf "sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/controller-runtime/pkg/log/zap"
+	metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
+	"sigs.k8s.io/controller-runtime/pkg/webhook"
+)
+
+// These tests use Ginkgo (BDD-style Go testing framework). Refer to
+// http://onsi.github.io/ginkgo/ to learn more about Ginkgo.
+
+var (
+	cancel    context.CancelFunc
+	cfg       *rest.Config
+	ctx       context.Context
+	k8sClient client.Client
+	testEnv   *envtest.Environment
+)
+
+func TestAPIs(t *testing.T) {
+	RegisterFailHandler(Fail)
+
+	RunSpecs(t, "Webhook Suite")
+}
+
+var _ = BeforeSuite(func() {
+	logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true)))
+
+	ctx, cancel = context.WithCancel(context.TODO())
+
+	By("bootstrapping test environment")
+	testEnv = &envtest.Environment{
+		CRDDirectoryPaths:     []string{filepath.Join("..", "..", "..", "config", "crd", "bases")},
+		ErrorIfCRDPathMissing: false,
+
+		// The BinaryAssetsDirectory is only required if you want to run the tests directly,
+		// without calling the makefile target test. If it is not set, it will look for the
+		// default path defined in controller-runtime, which is /usr/local/kubebuilder/.
+		// Note that you must have the required binaries set up under the bin directory to run
+		// the tests directly. When we run make test they are set up and used automatically.
+		BinaryAssetsDirectory: filepath.Join("..", "..", "..", "bin", "k8s",
+			fmt.Sprintf("1.31.0-%s-%s", runtime.GOOS, runtime.GOARCH)),
+
+		WebhookInstallOptions: envtest.WebhookInstallOptions{
+			Paths: []string{filepath.Join("..", "..", "..", "config", "webhook")},
+		},
+	}
+
+	var err error
+	// cfg is defined in this file globally.
+	cfg, err = testEnv.Start()
+	Expect(err).NotTo(HaveOccurred())
+	Expect(cfg).NotTo(BeNil())
+
+	scheme := apimachineryruntime.NewScheme()
+	err = corev1.AddToScheme(scheme)
+	Expect(err).NotTo(HaveOccurred())
+
+	err = admissionv1.AddToScheme(scheme)
+	Expect(err).NotTo(HaveOccurred())
+
+	// +kubebuilder:scaffold:scheme
+
+	k8sClient, err = client.New(cfg, client.Options{Scheme: scheme})
+	Expect(err).NotTo(HaveOccurred())
+	Expect(k8sClient).NotTo(BeNil())
+
+	// start webhook server using Manager.
+	webhookInstallOptions := &testEnv.WebhookInstallOptions
+	mgr, err := ctrl.NewManager(cfg, ctrl.Options{
+		Scheme: scheme,
+		WebhookServer: webhook.NewServer(webhook.Options{
+			Host:    webhookInstallOptions.LocalServingHost,
+			Port:    webhookInstallOptions.LocalServingPort,
+			CertDir: webhookInstallOptions.LocalServingCertDir,
+		}),
+		LeaderElection: false,
+		Metrics:        metricsserver.Options{BindAddress: "0"},
+	})
+	Expect(err).NotTo(HaveOccurred())
+
+	err = SetupPodWebhookWithManager(mgr)
+	Expect(err).NotTo(HaveOccurred())
+
+	// +kubebuilder:scaffold:webhook
+
+	go func() {
+		defer GinkgoRecover()
+		err = mgr.Start(ctx)
+		Expect(err).NotTo(HaveOccurred())
+	}()
+
+	// wait for the webhook server to get ready.
+	dialer := &net.Dialer{Timeout: time.Second}
+	addrPort := fmt.Sprintf("%s:%d", webhookInstallOptions.LocalServingHost, webhookInstallOptions.LocalServingPort)
+	Eventually(func() error {
+		conn, err := tls.DialWithDialer(dialer, "tcp", addrPort, &tls.Config{InsecureSkipVerify: true})
+		if err != nil {
+			return err
+		}
+
+		return conn.Close()
+	}).Should(Succeed())
+})
+
+var _ = AfterSuite(func() {
+	By("tearing down the test environment")
+	cancel()
+	err := testEnv.Stop()
+	Expect(err).NotTo(HaveOccurred())
+})
diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go
index f5ff7c5..b886624 100644
--- a/test/e2e/e2e_test.go
+++ b/test/e2e/e2e_test.go
@@ -244,6 +244,20 @@ var _ = Describe("Manager", Ordered, func() {
 			Eventually(verifyCertManager).Should(Succeed())
 		})
 
+		It("should have CA injection for mutating webhooks", func() {
+			By("checking CA injection for mutating webhooks")
+			verifyCAInjection := func(g Gomega) {
+				cmd := exec.Command("kubectl", "get",
+					"mutatingwebhookconfigurations.admissionregistration.k8s.io",
+					"tensor-fusion-operator-mutating-webhook-configuration",
+					"-o", "go-template={{ range .webhooks }}{{ .clientConfig.caBundle }}{{ end }}")
+				mwhOutput, err := utils.Run(cmd)
+				g.Expect(err).NotTo(HaveOccurred())
+				g.Expect(len(mwhOutput)).To(BeNumerically(">", 10))
+			}
+			Eventually(verifyCAInjection).Should(Succeed())
+		})
+
 		// +kubebuilder:scaffold:e2e-webhooks-checks
 
 		// TODO: Customize the e2e test suite with scenarios specific to your project.

From 4bc43c45eeee51fc0a5899798b9dcfc4cda84c4a Mon Sep 17 00:00:00 2001
From: 0x5457 <0x5457@protonmail.com>
Date: Fri, 6 Dec 2024 05:40:52 +0000
Subject: [PATCH 14/22] fix tests

---
 internal/webhook/v1/pod_webhook_test.go   | 7 ++-----
 internal/webhook/v1/webhook_suite_test.go | 4 +++-
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/internal/webhook/v1/pod_webhook_test.go b/internal/webhook/v1/pod_webhook_test.go
index 1fb7a60..44e1981 100644
--- a/internal/webhook/v1/pod_webhook_test.go
+++ b/internal/webhook/v1/pod_webhook_test.go
@@ -26,16 +26,13 @@ import (
 
 var _ = Describe("Pod Webhook", func() {
 	var (
-		obj       *corev1.Pod
-		oldObj    *corev1.Pod
-		defaulter PodCustomDefaulter
+		obj    *corev1.Pod
+		oldObj *corev1.Pod
 	)
 
 	BeforeEach(func() {
 		obj = &corev1.Pod{}
 		oldObj = &corev1.Pod{}
-		defaulter = PodCustomDefaulter{}
-		Expect(defaulter).NotTo(BeNil(), "Expected defaulter to be initialized")
 		Expect(oldObj).NotTo(BeNil(), "Expected oldObj to be initialized")
 		Expect(obj).NotTo(BeNil(), "Expected obj to be initialized")
 		// TODO (user): Add any setup logic common to all tests
diff --git a/internal/webhook/v1/webhook_suite_test.go b/internal/webhook/v1/webhook_suite_test.go
index 2ab08ea..119c947 100644
--- a/internal/webhook/v1/webhook_suite_test.go
+++ b/internal/webhook/v1/webhook_suite_test.go
@@ -26,6 +26,7 @@ import (
 	"testing"
 	"time"
 
+	"github.com/NexusGPU/tensor-fusion-operator/internal/config"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 
@@ -117,7 +118,8 @@ var _ = BeforeSuite(func() {
 	})
 	Expect(err).NotTo(HaveOccurred())
 
-	err = SetupPodWebhookWithManager(mgr)
+	conf := config.NewDefaultConfig()
+	err = SetupPodWebhookWithManager(mgr, &conf.PodMutator)
 	Expect(err).NotTo(HaveOccurred())
 
 	// +kubebuilder:scaffold:webhook

From 4d3bc2df69208bb2b756af5e99739eb09c64d74b Mon Sep 17 00:00:00 2001
From: 0x5457 <0x5457@protonmail.com>
Date: Fri, 6 Dec 2024 08:15:51 +0000
Subject: [PATCH 15/22] feat(scheduler): implement naive scheduler release with
 resource tracking

- Update Release method in naive scheduler to properly track and restore resources
- Add requeue duration for pending connections
- Fix status updates in TensorFusionConnection controller
- Update tests to verify partial and full resource releases
- Make QosClass optional in TensorFusionConnection status
---
 api/v1/tensorfusionconnection_types.go        |   2 +-
 cmd/main.go                                   |   7 +-
 ...sor-fusion.ai_tensorfusionconnections.yaml |   1 -
 config/manager/kustomization.yaml             |   2 +
 config/samples/v1_gpunode.yaml                |  12 +-
 config/samples/v1_tensorfusionconnection.yaml |   9 +-
 config/webhook/kustomization.yaml             |   3 +
 go.mod                                        |   3 +-
 internal/constants/constants.go               |   4 +
 .../tensorfusionconnection_controller.go      |  20 +-
 internal/scheduler/naive.go                   |   9 +-
 internal/scheduler/naive_test.go              |  71 ++++++--
 internal/scheduler/scheduler.go               |   2 +-
 internal/webhook/v1/pod_webhook_test.go       | 171 +++++++++++++++---
 14 files changed, 259 insertions(+), 57 deletions(-)

diff --git a/api/v1/tensorfusionconnection_types.go b/api/v1/tensorfusionconnection_types.go
index 00af5f2..5594e92 100644
--- a/api/v1/tensorfusionconnection_types.go
+++ b/api/v1/tensorfusionconnection_types.go
@@ -48,7 +48,7 @@ const (
 type TensorFusionConnectionStatus struct {
 	Phase         TensorFusionConnectionPhase `json:"phase"`
 	ConnectionURL string                      `json:"connectionURL"`
-	QosClass      string                      `json:"qosClass"`
+	QosClass      string                      `json:"qosClass,omitempty"`
 	Node          string                      `json:"node,omitempty"`
 }
 
diff --git a/cmd/main.go b/cmd/main.go
index 3ed420c..eb3e6a7 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -152,15 +152,16 @@ func main() {
 
 	ctx := context.Background()
 	config := config.NewDefaultConfig()
+	scheduler := scheduler.NewNaiveScheduler()
 	if err = (&controller.TensorFusionConnectionReconciler{
-		Client: mgr.GetClient(),
-		Scheme: mgr.GetScheme(),
+		Client:    mgr.GetClient(),
+		Scheme:    mgr.GetScheme(),
+		Scheduler: scheduler,
 	}).SetupWithManager(mgr); err != nil {
 		setupLog.Error(err, "unable to create controller", "controller", "TensorFusionConnection")
 		os.Exit(1)
 	}
 
-	scheduler := scheduler.NewNaiveScheduler()
 	if err = (&controller.GPUNodeReconciler{
 		Client:    mgr.GetClient(),
 		Scheme:    mgr.GetScheme(),
diff --git a/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml b/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml
index 9fb2714..43407d0 100644
--- a/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml
+++ b/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml
@@ -100,7 +100,6 @@ spec:
             required:
             - connectionURL
             - phase
-            - qosClass
             type: object
         type: object
     served: true
diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml
index 5c5f0b8..be49923 100644
--- a/config/manager/kustomization.yaml
+++ b/config/manager/kustomization.yaml
@@ -1,2 +1,4 @@
+namespace: tensor-fusion
+
 resources:
 - manager.yaml
diff --git a/config/samples/v1_gpunode.yaml b/config/samples/v1_gpunode.yaml
index 0a5d491..484525d 100644
--- a/config/samples/v1_gpunode.yaml
+++ b/config/samples/v1_gpunode.yaml
@@ -5,5 +5,13 @@ metadata:
     app.kubernetes.io/name: tensor-fusion-operator
     app.kubernetes.io/managed-by: kustomize
   name: gpunode-sample
-spec:
-  # TODO(user): Add fields here
+  namespace: tensor-fusion
+status:
+  capacity:
+    tflops: '200'
+    vram: 100Gi
+  available:
+    tflops: '200'
+    vram: 100Gi
+  devices: []
+  
\ No newline at end of file
diff --git a/config/samples/v1_tensorfusionconnection.yaml b/config/samples/v1_tensorfusionconnection.yaml
index 3eb2690..cc634ff 100644
--- a/config/samples/v1_tensorfusionconnection.yaml
+++ b/config/samples/v1_tensorfusionconnection.yaml
@@ -5,5 +5,12 @@ metadata:
     app.kubernetes.io/name: tensor-fusion-operator
     app.kubernetes.io/managed-by: kustomize
   name: tensorfusionconnection-sample
+  namespace: tensor-fusion
 spec:
-  # TODO(user): Add fields here
+  resources:
+    limit:
+      tflops: '100'
+      vram: 8Gi
+    request:
+      tflops: '20'
+      vram: 9Gi
diff --git a/config/webhook/kustomization.yaml b/config/webhook/kustomization.yaml
index 9cf2613..3535f61 100644
--- a/config/webhook/kustomization.yaml
+++ b/config/webhook/kustomization.yaml
@@ -1,6 +1,9 @@
+namespace: tensor-fusion
+
 resources:
 - manifests.yaml
 - service.yaml
 
 configurations:
 - kustomizeconfig.yaml
+
diff --git a/go.mod b/go.mod
index deab21d..bbffb49 100644
--- a/go.mod
+++ b/go.mod
@@ -7,6 +7,7 @@ require (
 	github.com/gin-gonic/gin v1.10.0
 	github.com/onsi/ginkgo/v2 v2.19.0
 	github.com/onsi/gomega v1.33.1
+	gomodules.xyz/jsonpatch/v2 v2.4.0
 	k8s.io/api v0.31.0
 	k8s.io/apimachinery v0.31.0
 	k8s.io/client-go v0.31.0
@@ -97,11 +98,11 @@ require (
 	golang.org/x/text v0.16.0 // indirect
 	golang.org/x/time v0.3.0 // indirect
 	golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect
-	gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
 	google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157 // indirect
 	google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094 // indirect
 	google.golang.org/grpc v1.65.0 // indirect
 	google.golang.org/protobuf v1.34.2 // indirect
+	gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
 	gopkg.in/inf.v0 v0.9.1 // indirect
 	gopkg.in/yaml.v2 v2.4.0 // indirect
 	gopkg.in/yaml.v3 v3.0.1 // indirect
diff --git a/internal/constants/constants.go b/internal/constants/constants.go
index acf7eee..130a29d 100644
--- a/internal/constants/constants.go
+++ b/internal/constants/constants.go
@@ -1,5 +1,7 @@
 package constants
 
+import "time"
+
 const (
 	// TensorFusionDomain is the domain prefix used for all tensor-fusion.ai related annotations and finalizers
 	TensorFusionDomain = "tensor-fusion.ai"
@@ -12,4 +14,6 @@ const (
 	EnableContainerAnnotationFormat = TensorFusionDomain + "/enable-%s"
 	TFLOPSContainerAnnotationFormat = TensorFusionDomain + "/tflops-%s"
 	VRAMContainerAnnotationFormat   = TensorFusionDomain + "/vram-%s"
+
+	PendingRequeueDuration = time.Second * 3
 )
diff --git a/internal/controller/tensorfusionconnection_controller.go b/internal/controller/tensorfusionconnection_controller.go
index d546459..47c6397 100644
--- a/internal/controller/tensorfusionconnection_controller.go
+++ b/internal/controller/tensorfusionconnection_controller.go
@@ -96,14 +96,16 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct
 	// If status is not set or pending, try to schedule
 	if connection.Status.Phase == "" || connection.Status.Phase == tfv1.TensorFusionConnectionPending {
 		// Try to get an available node from scheduler
-		node, err := r.Scheduler.Schedule(connection.Spec.Resources.Request)
+		var err error
+		node, err = r.Scheduler.Schedule(connection.Spec.Resources.Request)
 		if err != nil {
-			log.Error(err, "Failed to schedule connection")
+			log.Info(err.Error())
 			connection.Status.Phase = tfv1.TensorFusionConnectionPending
 		} else if node != nil {
 			connection.Status.Phase = tfv1.TensorFusionConnectionRunning
 			connection.Status.ConnectionURL = worker.GenerateConnectionURL(node, connection)
-			connection.Status.Node = node.Name // Store the node name for cleanup
+			// Store the node name for cleanup
+			connection.Status.Node = node.Name
 		} else {
 			connection.Status.Phase = tfv1.TensorFusionConnectionPending
 		}
@@ -113,6 +115,9 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct
 		return ctrl.Result{}, err
 	}
 
+	if connection.Status.Phase == tfv1.TensorFusionConnectionPending {
+		return ctrl.Result{RequeueAfter: constants.PendingRequeueDuration}, nil
+	}
 	return ctrl.Result{}, nil
 }
 
@@ -124,7 +129,7 @@ func (r *TensorFusionConnectionReconciler) handleDeletion(ctx context.Context, c
 
 	// Get the node
 	node := &tfv1.GPUNode{}
-	if err := r.Get(ctx, client.ObjectKey{Name: connection.Status.Node}, node); err != nil {
+	if err := r.Get(ctx, client.ObjectKey{Name: connection.Status.Node, Namespace: connection.Namespace}, node); err != nil {
 		if errors.IsNotFound(err) {
 			// Node is already gone, nothing to do
 			return nil
@@ -133,11 +138,11 @@ func (r *TensorFusionConnectionReconciler) handleDeletion(ctx context.Context, c
 	}
 
 	// Release the resources
-	if err := r.Scheduler.Release(node); err != nil {
+	if err := r.Scheduler.Release(connection.Spec.Resources.Request, node); err != nil {
 		return err
 	}
 
-	return nil
+	return r.MustUpdateStatus(ctx, connection, node)
 }
 
 // Helper functions to handle finalizers
@@ -172,8 +177,7 @@ func (r *TensorFusionConnectionReconciler) MustUpdateStatus(ctx context.Context,
 		}
 
 		// Update the status fields we care about
-		latestConnection.Status.Phase = connection.Status.Phase
-		latestConnection.Status.ConnectionURL = connection.Status.ConnectionURL
+		latestConnection.Status = connection.Status
 
 		// Update the connection status
 		if err := r.Status().Update(ctx, latestConnection); err != nil {
diff --git a/internal/scheduler/naive.go b/internal/scheduler/naive.go
index 0192173..2af423e 100644
--- a/internal/scheduler/naive.go
+++ b/internal/scheduler/naive.go
@@ -60,7 +60,7 @@ func (s *NaiveScheduler) OnDelete(node *tfv1.GPUNode) {
 }
 
 // Release implements Scheduler interface
-func (s *NaiveScheduler) Release(node *tfv1.GPUNode) error {
+func (s *NaiveScheduler) Release(request tfv1.Resource, node *tfv1.GPUNode) error {
 	s.Lock()
 	defer s.Unlock()
 
@@ -69,7 +69,10 @@ func (s *NaiveScheduler) Release(node *tfv1.GPUNode) error {
 		return fmt.Errorf("node %s not found", node.Name)
 	}
 
-	// Reset the node's available resources to its capacity
-	existingNode.Status.Available = existingNode.Status.Capacity
+	// Add back the released resources
+	existingNode.Status.Available.Tflops.Add(request.Tflops)
+	existingNode.Status.Available.Vram.Add(request.Vram)
+	// Copy the updated availability back into the caller's node object
+	node.Status.Available = existingNode.Status.Available
 	return nil
 }
diff --git a/internal/scheduler/naive_test.go b/internal/scheduler/naive_test.go
index 823f8c4..3cdfea4 100644
--- a/internal/scheduler/naive_test.go
+++ b/internal/scheduler/naive_test.go
@@ -168,14 +168,18 @@ func TestNaiveScheduler_NodeOperations(t *testing.T) {
 
 func TestNaiveScheduler_Release(t *testing.T) {
 	tests := []struct {
-		name      string
-		node      *tfv1.GPUNode
-		schedule  *tfv1.Resource
-		wantError bool
+		name                string
+		node               *tfv1.GPUNode
+		schedule           *tfv1.Resource
+		release            *tfv1.Resource
+		wantError          bool
+		wantRemainingTflops string
+		wantRemainingVram   string
 	}{
 		{
 			name:      "release non-existent node",
 			node:      createGPUNode("node1", "100", "16Gi"),
+			release:   &tfv1.Resource{},
 			wantError: true,
 		},
 		{
@@ -199,7 +203,42 @@ func TestNaiveScheduler_Release(t *testing.T) {
 				Tflops: resource.MustParse("50"),
 				Vram:   resource.MustParse("8Gi"),
 			},
-			wantError: false,
+			release: &tfv1.Resource{
+				Tflops: resource.MustParse("50"),
+				Vram:   resource.MustParse("8Gi"),
+			},
+			wantError:          false,
+			wantRemainingTflops: "100",
+			wantRemainingVram:   "16Gi",
+		},
+		{
+			name: "partial release",
+			node: &tfv1.GPUNode{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node1",
+				},
+				Status: tfv1.GPUNodeStatus{
+					Capacity: tfv1.Resource{
+						Tflops: resource.MustParse("100"),
+						Vram:   resource.MustParse("16Gi"),
+					},
+					Available: tfv1.Resource{
+						Tflops: resource.MustParse("100"),
+						Vram:   resource.MustParse("16Gi"),
+					},
+				},
+			},
+			schedule: &tfv1.Resource{
+				Tflops: resource.MustParse("60"),
+				Vram:   resource.MustParse("10Gi"),
+			},
+			release: &tfv1.Resource{
+				Tflops: resource.MustParse("30"),
+				Vram:   resource.MustParse("5Gi"),
+			},
+			wantError:          false,
+			wantRemainingTflops: "70",
+			wantRemainingVram:   "11Gi",
 		},
 	}
 
@@ -220,26 +259,34 @@ func TestNaiveScheduler_Release(t *testing.T) {
 					}
 
 					// Verify resources were allocated
-					if node.Status.Available.Tflops.Cmp(resource.MustParse("50")) != 0 ||
-						node.Status.Available.Vram.Cmp(resource.MustParse("8Gi")) != 0 {
+					expectedTflops := tt.node.Status.Capacity.Tflops.DeepCopy()
+					expectedVram := tt.node.Status.Capacity.Vram.DeepCopy()
+					expectedTflops.Sub(tt.schedule.Tflops)
+					expectedVram.Sub(tt.schedule.Vram)
+					if node.Status.Available.Tflops.Cmp(expectedTflops) != 0 ||
+						node.Status.Available.Vram.Cmp(expectedVram) != 0 {
 						t.Errorf("Schedule() did not allocate resources correctly")
 						return
 					}
 				}
 			}
 
-			err := s.Release(tt.node)
+			err := s.Release(*tt.release, tt.node)
 			if (err != nil) != tt.wantError {
 				t.Errorf("Release() error = %v, wantError %v", err, tt.wantError)
 				return
 			}
 
 			if !tt.wantError {
-				// Verify resources were restored
+				// Verify resources were restored correctly
 				node := s.nodes[tt.node.Name]
-				if node.Status.Available.Tflops.Cmp(node.Status.Capacity.Tflops) != 0 ||
-					node.Status.Available.Vram.Cmp(node.Status.Capacity.Vram) != 0 {
-					t.Errorf("Release() did not restore resources correctly")
+				if node.Status.Available.Tflops.String() != tt.wantRemainingTflops ||
+					node.Status.Available.Vram.String() != tt.wantRemainingVram {
+					t.Errorf("Release() resources incorrect, got tflops=%v vram=%v, want tflops=%v vram=%v",
+						node.Status.Available.Tflops.String(),
+						node.Status.Available.Vram.String(),
+						tt.wantRemainingTflops,
+						tt.wantRemainingVram)
 				}
 			}
 		})
diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go
index 163124c..bd18893 100644
--- a/internal/scheduler/scheduler.go
+++ b/internal/scheduler/scheduler.go
@@ -12,7 +12,7 @@ type Scheduler interface {
 	Schedule(request tfv1.Resource) (*tfv1.GPUNode, error)
 
 	// Release frees the allocated resources of a node
-	Release(node *tfv1.GPUNode) error
+	Release(request tfv1.Resource, node *tfv1.GPUNode) error
 
 	// OnAdd is called when a new node is added
 	OnAdd(node *tfv1.GPUNode)
diff --git a/internal/webhook/v1/pod_webhook_test.go b/internal/webhook/v1/pod_webhook_test.go
index 44e1981..87eaa79 100644
--- a/internal/webhook/v1/pod_webhook_test.go
+++ b/internal/webhook/v1/pod_webhook_test.go
@@ -17,42 +17,165 @@ limitations under the License.
 package v1
 
 import (
+	"context"
+	"encoding/json"
+	"net/http"
+
+	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+	"github.com/NexusGPU/tensor-fusion-operator/internal/config"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
-
+	admissionv1 "k8s.io/api/admission/v1"
 	corev1 "k8s.io/api/core/v1"
-	// TODO (user): Add any additional imports if needed
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
 )
 
-var _ = Describe("Pod Webhook", func() {
+var _ = Describe("TensorFusionPodMutator", func() {
 	var (
-		obj    *corev1.Pod
-		oldObj *corev1.Pod
+		mutator *TensorFusionPodMutator
+		ctx     context.Context
+		scheme  *runtime.Scheme
+		decoder admission.Decoder
+		client  client.Client
 	)
 
 	BeforeEach(func() {
-		obj = &corev1.Pod{}
-		oldObj = &corev1.Pod{}
-		Expect(oldObj).NotTo(BeNil(), "Expected oldObj to be initialized")
-		Expect(obj).NotTo(BeNil(), "Expected obj to be initialized")
-		// TODO (user): Add any setup logic common to all tests
-	})
+		ctx = context.Background()
+		scheme = runtime.NewScheme()
+		Expect(corev1.AddToScheme(scheme)).To(Succeed())
+		Expect(tfv1.AddToScheme(scheme)).To(Succeed())
+
+		decoder = admission.NewDecoder(scheme)
+		client = fake.NewClientBuilder().WithScheme(scheme).Build()
 
-	AfterEach(func() {
-		// TODO (user): Add any teardown logic common to all tests
+		config := config.NewDefaultConfig()
+		mutator = &TensorFusionPodMutator{
+			Client: client,
+			Config: &config.PodMutator,
+		}
+		Expect(mutator.InjectDecoder(decoder)).To(Succeed())
 	})
 
-	Context("When creating Pod under Defaulting Webhook", func() {
-		// TODO (user): Add logic for defaulting webhooks
-		// Example:
-		// It("Should apply defaults when a required field is empty", func() {
-		//     By("simulating a scenario where defaults should be applied")
-		//     obj.SomeFieldWithDefault = ""
-		//     By("calling the Default method to apply defaults")
-		//     defaulter.Default(ctx, obj)
-		//     By("checking that the default values are set")
-		//     Expect(obj.SomeFieldWithDefault).To(Equal("default_value"))
-		// })
+	Context("Handle", func() {
+		It("should successfully mutate a pod with TF requirements", func() {
+			pod := &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-pod",
+					Namespace: "default",
+					Annotations: map[string]string{
+						"tf.nexusgpu.com/tflops": "100",
+						"tf.nexusgpu.com/vram":   "16Gi",
+					},
+				},
+				Spec: corev1.PodSpec{
+					Containers: []corev1.Container{
+						{
+							Name:  "main",
+							Image: "test-image",
+						},
+					},
+				},
+			}
+
+			podBytes, err := json.Marshal(pod)
+			Expect(err).NotTo(HaveOccurred())
+
+			req := admission.Request{
+				AdmissionRequest: admissionv1.AdmissionRequest{
+					Object: runtime.RawExtension{
+						Raw: podBytes,
+					},
+					Operation: admissionv1.Create,
+				},
+			}
+
+			resp := mutator.Handle(ctx, req)
+			Expect(resp.Allowed).To(BeTrue())
+			Expect(resp.Patches).NotTo(BeEmpty())
+
+			// Verify TensorFusionConnection was created
+			tfConnList := &tfv1.TensorFusionConnectionList{}
+			err = client.List(ctx, tfConnList)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(tfConnList.Items).To(HaveLen(1))
+		})
+
+		It("should handle pods without TF requirements", func() {
+			pod := &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-pod-no-tf",
+					Namespace: "default",
+				},
+				Spec: corev1.PodSpec{
+					Containers: []corev1.Container{
+						{
+							Name:  "main",
+							Image: "test-image",
+						},
+					},
+				},
+			}
+
+			podBytes, err := json.Marshal(pod)
+			Expect(err).NotTo(HaveOccurred())
+
+			req := admission.Request{
+				AdmissionRequest: admissionv1.AdmissionRequest{
+					Object: runtime.RawExtension{
+						Raw: podBytes,
+					},
+					Operation: admissionv1.Create,
+				},
+			}
+
+			resp := mutator.Handle(ctx, req)
+			Expect(resp.Allowed).To(BeTrue())
+			Expect(resp.Patches).To(BeEmpty())
+		})
+
+		It("should handle invalid pod specification", func() {
+			req := admission.Request{
+				AdmissionRequest: admissionv1.AdmissionRequest{
+					Object: runtime.RawExtension{
+						Raw: []byte("invalid json"),
+					},
+					Operation: admissionv1.Create,
+				},
+			}
+
+			resp := mutator.Handle(ctx, req)
+			Expect(resp.Allowed).To(BeFalse())
+			Expect(resp.Result.Code).To(Equal(int32(http.StatusBadRequest)))
+		})
 	})
 
+	Context("parseTFReq", func() {
+		It("should correctly parse TF requirements from pod annotations", func() {
+			pod := &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Annotations: map[string]string{
+						"tf.nexusgpu.com/tflops": "100",
+						"tf.nexusgpu.com/vram":   "16Gi",
+					},
+				},
+				Spec: corev1.PodSpec{
+					Containers: []corev1.Container{
+						{
+							Name: "test-container",
+						},
+					},
+				},
+			}
+
+			reqs := parseTFReq(pod)
+			Expect(reqs).To(HaveLen(1))
+			Expect(reqs[0].ContainerName).To(Equal("test-container"))
+			Expect(reqs[0].Tflops.String()).To(Equal("100"))
+			Expect(reqs[0].Vram.String()).To(Equal("16Gi"))
+		})
+	})
 })

From 107f4f987b8c1773da1db326e653af81b9d4b789 Mon Sep 17 00:00:00 2001
From: 0x5457 <0x5457@protonmail.com>
Date: Fri, 6 Dec 2024 08:31:23 +0000
Subject: [PATCH 16/22] feat(server): add connection URL response and logging
 middleware

- Return the connection URL immediately when the connection is already running
- Enable gin logging middleware for better request tracking
- Improve connection status handling
---
 internal/server/router/connection.go | 5 +++++
 internal/server/server.go            | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/internal/server/router/connection.go b/internal/server/router/connection.go
index 1abad0e..678c685 100644
--- a/internal/server/router/connection.go
+++ b/internal/server/router/connection.go
@@ -35,6 +35,11 @@ func (cr *ConnectionRouter) Get(ctx *gin.Context) {
 		return
 	}
 
+	if conn.Status.Phase == tfv1.TensorFusionConnectionRunning {
+		ctx.JSON(200, conn.Status.ConnectionURL)
+		return
+	}
+
 	// Subscribe to connection updates
 	ch, cancelFunc := cr.watcher.subscribe(req)
 	defer cancelFunc()
diff --git a/internal/server/server.go b/internal/server/server.go
index fa2995c..3131b47 100644
--- a/internal/server/server.go
+++ b/internal/server/server.go
@@ -9,9 +9,11 @@ import (
 func NewHTTPServer(
 	cr *router.ConnectionRouter,
 ) *gin.Engine {
+
 	r := gin.New()
 	r.Use(gzip.Gzip(gzip.DefaultCompression))
 	r.Use(gin.Recovery())
+	r.Use(gin.Logger())
 
 	apiGroup := r.Group("/api")
 	apiGroup.GET("/connection", cr.Get)

From 345580f700a41b35ce90f63981a84b2969097e07 Mon Sep 17 00:00:00 2001
From: 0x5457 <0x5457@protonmail.com>
Date: Fri, 6 Dec 2024 08:49:58 +0000
Subject: [PATCH 17/22] feat: change GPUNode to a cluster-scoped resource

---
 PROJECT                                                  | 1 -
 api/v1/gpunode_types.go                                  | 2 +-
 config/crd/bases/tensor-fusion.ai_gpunodes.yaml          | 2 +-
 config/samples/v1_gpunode.yaml                           | 1 -
 internal/controller/tensorfusionconnection_controller.go | 2 +-
 5 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/PROJECT b/PROJECT
index cc6a157..9104511 100644
--- a/PROJECT
+++ b/PROJECT
@@ -18,7 +18,6 @@ resources:
   version: v1
 - api:
     crdVersion: v1
-    namespaced: true
   controller: true
   domain: tensor-fusion.ai
   kind: GPUNode
diff --git a/api/v1/gpunode_types.go b/api/v1/gpunode_types.go
index 6a752ba..18ab7f1 100644
--- a/api/v1/gpunode_types.go
+++ b/api/v1/gpunode_types.go
@@ -29,7 +29,7 @@ type GPUNodeStatus struct {
 
 // +kubebuilder:object:root=true
 // +kubebuilder:subresource:status
-
+// +kubebuilder:resource:scope=Cluster
 // GPUNode is the Schema for the gpunodes API.
 type GPUNode struct {
 	metav1.TypeMeta   `json:",inline"`
diff --git a/config/crd/bases/tensor-fusion.ai_gpunodes.yaml b/config/crd/bases/tensor-fusion.ai_gpunodes.yaml
index 34442aa..bb62054 100644
--- a/config/crd/bases/tensor-fusion.ai_gpunodes.yaml
+++ b/config/crd/bases/tensor-fusion.ai_gpunodes.yaml
@@ -12,7 +12,7 @@ spec:
     listKind: GPUNodeList
     plural: gpunodes
     singular: gpunode
-  scope: Namespaced
+  scope: Cluster
   versions:
   - name: v1
     schema:
diff --git a/config/samples/v1_gpunode.yaml b/config/samples/v1_gpunode.yaml
index 484525d..84a50cb 100644
--- a/config/samples/v1_gpunode.yaml
+++ b/config/samples/v1_gpunode.yaml
@@ -5,7 +5,6 @@ metadata:
     app.kubernetes.io/name: tensor-fusion-operator
     app.kubernetes.io/managed-by: kustomize
   name: gpunode-sample
-  namespace: tensor-fusion
 status:
   capacity:
     tflops: '200'
diff --git a/internal/controller/tensorfusionconnection_controller.go b/internal/controller/tensorfusionconnection_controller.go
index 47c6397..e631a55 100644
--- a/internal/controller/tensorfusionconnection_controller.go
+++ b/internal/controller/tensorfusionconnection_controller.go
@@ -129,7 +129,7 @@ func (r *TensorFusionConnectionReconciler) handleDeletion(ctx context.Context, c
 
 	// Get the node
 	node := &tfv1.GPUNode{}
-	if err := r.Get(ctx, client.ObjectKey{Name: connection.Status.Node, Namespace: connection.Namespace}, node); err != nil {
+	if err := r.Get(ctx, client.ObjectKey{Name: connection.Status.Node}, node); err != nil {
 		if errors.IsNotFound(err) {
 			// Node is already gone, nothing to do
 			return nil

From 2935d3dded975825895fba062f0c2521bb094ae6 Mon Sep 17 00:00:00 2001
From: 0x5457 <0x5457@protonmail.com>
Date: Mon, 9 Dec 2024 07:35:52 +0000
Subject: [PATCH 18/22] chore: rename Resources struct fields from
 Request/Limit to Requests/Limits to align with Kubernetes resource naming
 conventions.

---
 api/v1/tensorfusionconnection_types.go                   | 4 ++--
 api/v1/zz_generated.deepcopy.go                          | 4 ++--
 internal/controller/tensorfusionconnection_controller.go | 4 ++--
 internal/webhook/v1/pod_webhook.go                       | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/api/v1/tensorfusionconnection_types.go b/api/v1/tensorfusionconnection_types.go
index 5594e92..c79bfa4 100644
--- a/api/v1/tensorfusionconnection_types.go
+++ b/api/v1/tensorfusionconnection_types.go
@@ -27,8 +27,8 @@ type Resource struct {
 }
 
 type Resources struct {
-	Request Resource `json:"request"`
-	Limit   Resource `json:"limit"`
+	Requests Resource `json:"requests"`
+	Limits   Resource `json:"limits"`
 }
 
 // TensorFusionConnectionSpec defines the desired state of TensorFusionConnection.
diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go
index d899ad7..8ff9bbe 100644
--- a/api/v1/zz_generated.deepcopy.go
+++ b/api/v1/zz_generated.deepcopy.go
@@ -124,8 +124,8 @@ func (in *Resource) DeepCopy() *Resource {
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *Resources) DeepCopyInto(out *Resources) {
 	*out = *in
-	in.Request.DeepCopyInto(&out.Request)
-	in.Limit.DeepCopyInto(&out.Limit)
+	in.Requests.DeepCopyInto(&out.Requests)
+	in.Limits.DeepCopyInto(&out.Limits)
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Resources.
diff --git a/internal/controller/tensorfusionconnection_controller.go b/internal/controller/tensorfusionconnection_controller.go
index e631a55..65937b1 100644
--- a/internal/controller/tensorfusionconnection_controller.go
+++ b/internal/controller/tensorfusionconnection_controller.go
@@ -97,7 +97,7 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct
 	if connection.Status.Phase == "" || connection.Status.Phase == tfv1.TensorFusionConnectionPending {
 		// Try to get an available node from scheduler
 		var err error
-		node, err = r.Scheduler.Schedule(connection.Spec.Resources.Request)
+		node, err = r.Scheduler.Schedule(connection.Spec.Resources.Requests)
 		if err != nil {
 			log.Info(err.Error())
 			connection.Status.Phase = tfv1.TensorFusionConnectionPending
@@ -138,7 +138,7 @@ func (r *TensorFusionConnectionReconciler) handleDeletion(ctx context.Context, c
 	}
 
 	// Release the resources
-	if err := r.Scheduler.Release(connection.Spec.Resources.Request, node); err != nil {
+	if err := r.Scheduler.Release(connection.Spec.Resources.Requests, node); err != nil {
 		return err
 	}
 
diff --git a/internal/webhook/v1/pod_webhook.go b/internal/webhook/v1/pod_webhook.go
index 8edcb70..2b9bfcf 100644
--- a/internal/webhook/v1/pod_webhook.go
+++ b/internal/webhook/v1/pod_webhook.go
@@ -207,11 +207,11 @@ func generateTensorFusionConnection(pod *corev1.Pod, tfReq []TFReq) []*tfv1.Tens
 			},
 			Spec: tfv1.TensorFusionConnectionSpec{
 				Resources: tfv1.Resources{
-					Request: tfv1.Resource{
+					Requests: tfv1.Resource{
 						Tflops: req.Tflops,
 						Vram:   req.Vram,
 					},
-					Limit: tfv1.Resource{
+					Limits: tfv1.Resource{
 						Tflops: req.Tflops,
 						Vram:   req.Vram,
 					},

From ade38f8781cbb88282f528c72d15b63b974172f0 Mon Sep 17 00:00:00 2001
From: 0x5457 <0x5457@protonmail.com>
Date: Mon, 9 Dec 2024 07:50:13 +0000
Subject: [PATCH 19/22] feat: rename from 'GPUNode' to 'GPU'

---
 PROJECT                                       |   2 +-
 api/v1/{gpunode_types.go => gpu_types.go}     |  19 ++-
 api/v1/tensorfusionconnection_types.go        |   2 +-
 api/v1/zz_generated.deepcopy.go               |  35 ++---
 cmd/main.go                                   |   4 +-
 ...unodes.yaml => tensor-fusion.ai_gpus.yaml} |  19 +--
 ...sor-fusion.ai_tensorfusionconnections.yaml |   8 +-
 config/crd/kustomization.yaml                 |   2 +-
 ..._editor_role.yaml => gpu_editor_role.yaml} |   8 +-
 ..._viewer_role.yaml => gpu_viewer_role.yaml} |   8 +-
 config/rbac/kustomization.yaml                |   4 +-
 config/rbac/role.yaml                         |   6 +-
 config/samples/kustomization.yaml             |   2 +-
 .../samples/{v1_gpunode.yaml => v1_gpu.yaml}  |   4 +-
 config/samples/v1_tensorfusionconnection.yaml |   4 +-
 ...punode_controller.go => gpu_controller.go} |  24 ++--
 ...troller_test.go => gpu_controller_test.go} |  19 ++-
 .../tensorfusionconnection_controller.go      |  50 +++----
 internal/scheduler/naive.go                   |  52 +++----
 internal/scheduler/naive_test.go              | 128 +++++++++---------
 internal/scheduler/scheduler.go               |  24 ++--
 internal/worker/worker.go                     |   2 +-
 22 files changed, 206 insertions(+), 220 deletions(-)
 rename api/v1/{gpunode_types.go => gpu_types.go} (71%)
 rename config/crd/bases/{tensor-fusion.ai_gpunodes.yaml => tensor-fusion.ai_gpus.yaml} (88%)
 rename config/rbac/{gpunode_editor_role.yaml => gpu_editor_role.yaml} (77%)
 rename config/rbac/{gpunode_viewer_role.yaml => gpu_viewer_role.yaml} (75%)
 rename config/samples/{v1_gpunode.yaml => v1_gpu.yaml} (87%)
 rename internal/controller/{gpunode_controller.go => gpu_controller.go} (67%)
 rename internal/controller/{gpunode_controller_test.go => gpu_controller_test.go} (84%)

diff --git a/PROJECT b/PROJECT
index 9104511..dfc0413 100644
--- a/PROJECT
+++ b/PROJECT
@@ -20,7 +20,7 @@ resources:
     crdVersion: v1
   controller: true
   domain: tensor-fusion.ai
-  kind: GPUNode
+  kind: GPU
   path: github.com/NexusGPU/tensor-fusion-operator/api/v1
   version: v1
 - core: true
diff --git a/api/v1/gpunode_types.go b/api/v1/gpu_types.go
similarity index 71%
rename from api/v1/gpunode_types.go
rename to api/v1/gpu_types.go
index 18ab7f1..9743024 100644
--- a/api/v1/gpunode_types.go
+++ b/api/v1/gpu_types.go
@@ -20,33 +20,32 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
-// GPUNodeStatus defines the observed state of GPUNode.
-type GPUNodeStatus struct {
+// GPUStatus defines the observed state of GPU.
+type GPUStatus struct {
 	Capacity  Resource `json:"capacity"`
 	Available Resource `json:"available"`
-	Devices   []string `json:"devices"`
 }
 
 // +kubebuilder:object:root=true
 // +kubebuilder:subresource:status
 // +kubebuilder:resource:scope=Cluster
-// GPUNode is the Schema for the gpunodes API.
-type GPUNode struct {
+// GPU is the Schema for the gpus API.
+type GPU struct {
 	metav1.TypeMeta   `json:",inline"`
 	metav1.ObjectMeta `json:"metadata,omitempty"`
 
-	Status GPUNodeStatus `json:"status,omitempty"`
+	Status GPUStatus `json:"status,omitempty"`
 }
 
 // +kubebuilder:object:root=true
 
-// GPUNodeList contains a list of GPUNode.
-type GPUNodeList struct {
+// GPUList contains a list of GPU.
+type GPUList struct {
 	metav1.TypeMeta `json:",inline"`
 	metav1.ListMeta `json:"metadata,omitempty"`
-	Items           []GPUNode `json:"items"`
+	Items           []GPU `json:"items"`
 }
 
 func init() {
-	SchemeBuilder.Register(&GPUNode{}, &GPUNodeList{})
+	SchemeBuilder.Register(&GPU{}, &GPUList{})
 }
diff --git a/api/v1/tensorfusionconnection_types.go b/api/v1/tensorfusionconnection_types.go
index c79bfa4..c14eb66 100644
--- a/api/v1/tensorfusionconnection_types.go
+++ b/api/v1/tensorfusionconnection_types.go
@@ -49,7 +49,7 @@ type TensorFusionConnectionStatus struct {
 	Phase         TensorFusionConnectionPhase `json:"phase"`
 	ConnectionURL string                      `json:"connectionURL"`
 	QosClass      string                      `json:"qosClass,omitempty"`
-	Node          string                      `json:"node,omitempty"`
+	GPU           string                      `json:"gpu,omitempty"`
 }
 
 // +kubebuilder:object:root=true
diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go
index 8ff9bbe..cf117f0 100644
--- a/api/v1/zz_generated.deepcopy.go
+++ b/api/v1/zz_generated.deepcopy.go
@@ -25,25 +25,25 @@ import (
 )
 
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
-func (in *GPUNode) DeepCopyInto(out *GPUNode) {
+func (in *GPU) DeepCopyInto(out *GPU) {
 	*out = *in
 	out.TypeMeta = in.TypeMeta
 	in.ObjectMeta.DeepCopyInto(&out.ObjectMeta)
 	in.Status.DeepCopyInto(&out.Status)
 }
 
-// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNode.
-func (in *GPUNode) DeepCopy() *GPUNode {
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPU.
+func (in *GPU) DeepCopy() *GPU {
 	if in == nil {
 		return nil
 	}
-	out := new(GPUNode)
+	out := new(GPU)
 	in.DeepCopyInto(out)
 	return out
 }
 
 // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
-func (in *GPUNode) DeepCopyObject() runtime.Object {
+func (in *GPU) DeepCopyObject() runtime.Object {
 	if c := in.DeepCopy(); c != nil {
 		return c
 	}
@@ -51,31 +51,31 @@ func (in *GPUNode) DeepCopyObject() runtime.Object {
 }
 
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
-func (in *GPUNodeList) DeepCopyInto(out *GPUNodeList) {
+func (in *GPUList) DeepCopyInto(out *GPUList) {
 	*out = *in
 	out.TypeMeta = in.TypeMeta
 	in.ListMeta.DeepCopyInto(&out.ListMeta)
 	if in.Items != nil {
 		in, out := &in.Items, &out.Items
-		*out = make([]GPUNode, len(*in))
+		*out = make([]GPU, len(*in))
 		for i := range *in {
 			(*in)[i].DeepCopyInto(&(*out)[i])
 		}
 	}
 }
 
-// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeList.
-func (in *GPUNodeList) DeepCopy() *GPUNodeList {
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUList.
+func (in *GPUList) DeepCopy() *GPUList {
 	if in == nil {
 		return nil
 	}
-	out := new(GPUNodeList)
+	out := new(GPUList)
 	in.DeepCopyInto(out)
 	return out
 }
 
 // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
-func (in *GPUNodeList) DeepCopyObject() runtime.Object {
+func (in *GPUList) DeepCopyObject() runtime.Object {
 	if c := in.DeepCopy(); c != nil {
 		return c
 	}
@@ -83,23 +83,18 @@ func (in *GPUNodeList) DeepCopyObject() runtime.Object {
 }
 
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
-func (in *GPUNodeStatus) DeepCopyInto(out *GPUNodeStatus) {
+func (in *GPUStatus) DeepCopyInto(out *GPUStatus) {
 	*out = *in
 	in.Capacity.DeepCopyInto(&out.Capacity)
 	in.Available.DeepCopyInto(&out.Available)
-	if in.Devices != nil {
-		in, out := &in.Devices, &out.Devices
-		*out = make([]string, len(*in))
-		copy(*out, *in)
-	}
 }
 
-// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUNodeStatus.
-func (in *GPUNodeStatus) DeepCopy() *GPUNodeStatus {
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUStatus.
+func (in *GPUStatus) DeepCopy() *GPUStatus {
 	if in == nil {
 		return nil
 	}
-	out := new(GPUNodeStatus)
+	out := new(GPUStatus)
 	in.DeepCopyInto(out)
 	return out
 }
diff --git a/cmd/main.go b/cmd/main.go
index eb3e6a7..47055fe 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -162,12 +162,12 @@ func main() {
 		os.Exit(1)
 	}
 
-	if err = (&controller.GPUNodeReconciler{
+	if err = (&controller.GPUReconciler{
 		Client:    mgr.GetClient(),
 		Scheme:    mgr.GetScheme(),
 		Scheduler: scheduler,
 	}).SetupWithManager(ctx, mgr); err != nil {
-		setupLog.Error(err, "unable to create controller", "controller", "GPUNode")
+		setupLog.Error(err, "unable to create controller", "controller", "GPU")
 		os.Exit(1)
 	}
 
diff --git a/config/crd/bases/tensor-fusion.ai_gpunodes.yaml b/config/crd/bases/tensor-fusion.ai_gpus.yaml
similarity index 88%
rename from config/crd/bases/tensor-fusion.ai_gpunodes.yaml
rename to config/crd/bases/tensor-fusion.ai_gpus.yaml
index bb62054..38ee9de 100644
--- a/config/crd/bases/tensor-fusion.ai_gpunodes.yaml
+++ b/config/crd/bases/tensor-fusion.ai_gpus.yaml
@@ -4,20 +4,20 @@ kind: CustomResourceDefinition
 metadata:
   annotations:
     controller-gen.kubebuilder.io/version: v0.16.4
-  name: gpunodes.tensor-fusion.ai
+  name: gpus.tensor-fusion.ai
 spec:
   group: tensor-fusion.ai
   names:
-    kind: GPUNode
-    listKind: GPUNodeList
-    plural: gpunodes
-    singular: gpunode
+    kind: GPU
+    listKind: GPUList
+    plural: gpus
+    singular: gpu
   scope: Cluster
   versions:
   - name: v1
     schema:
       openAPIV3Schema:
-        description: GPUNode is the Schema for the gpunodes API.
+        description: GPU is the Schema for the gpus API.
         properties:
           apiVersion:
             description: |-
@@ -37,7 +37,7 @@ spec:
           metadata:
             type: object
           status:
-            description: GPUNodeStatus defines the observed state of GPUNode.
+            description: GPUStatus defines the observed state of GPU.
             properties:
               available:
                 properties:
@@ -75,14 +75,9 @@ spec:
                 - tflops
                 - vram
                 type: object
-              devices:
-                items:
-                  type: string
-                type: array
             required:
             - available
             - capacity
-            - devices
             type: object
         type: object
     served: true
diff --git a/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml b/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml
index 43407d0..7b2b288 100644
--- a/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml
+++ b/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml
@@ -42,7 +42,7 @@ spec:
             properties:
               resources:
                 properties:
-                  limit:
+                  limits:
                     properties:
                       tflops:
                         anyOf:
@@ -60,7 +60,7 @@ spec:
                     - tflops
                     - vram
                     type: object
-                  request:
+                  requests:
                     properties:
                       tflops:
                         anyOf:
@@ -79,8 +79,8 @@ spec:
                     - vram
                     type: object
                 required:
-                - limit
-                - request
+                - limits
+                - requests
                 type: object
             required:
             - resources
diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml
index ef965fc..7532d6a 100644
--- a/config/crd/kustomization.yaml
+++ b/config/crd/kustomization.yaml
@@ -3,7 +3,7 @@
 # It should be run by config/default
 resources:
 - bases/tensor-fusion.ai_tensorfusionconnections.yaml
-- bases/tensor-fusion.ai_gpunodes.yaml
+- bases/tensor-fusion.ai_gpus.yaml
 # +kubebuilder:scaffold:crdkustomizeresource
 
 patches:
diff --git a/config/rbac/gpunode_editor_role.yaml b/config/rbac/gpu_editor_role.yaml
similarity index 77%
rename from config/rbac/gpunode_editor_role.yaml
rename to config/rbac/gpu_editor_role.yaml
index 10e6ec1..e512234 100644
--- a/config/rbac/gpunode_editor_role.yaml
+++ b/config/rbac/gpu_editor_role.yaml
@@ -1,16 +1,16 @@
-# permissions for end users to edit gpunodes.
+# permissions for end users to edit gpus.
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
   labels:
     app.kubernetes.io/name: tensor-fusion-operator
     app.kubernetes.io/managed-by: kustomize
-  name: gpunode-editor-role
+  name: gpu-editor-role
 rules:
 - apiGroups:
   - tensor-fusion.ai
   resources:
-  - gpunodes
+  - gpus
   verbs:
   - create
   - delete
@@ -22,6 +22,6 @@ rules:
 - apiGroups:
   - tensor-fusion.ai
   resources:
-  - gpunodes/status
+  - gpus/status
   verbs:
   - get
diff --git a/config/rbac/gpunode_viewer_role.yaml b/config/rbac/gpu_viewer_role.yaml
similarity index 75%
rename from config/rbac/gpunode_viewer_role.yaml
rename to config/rbac/gpu_viewer_role.yaml
index 376b12f..0a45f0c 100644
--- a/config/rbac/gpunode_viewer_role.yaml
+++ b/config/rbac/gpu_viewer_role.yaml
@@ -1,16 +1,16 @@
-# permissions for end users to view gpunodes.
+# permissions for end users to view gpus.
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
   labels:
     app.kubernetes.io/name: tensor-fusion-operator
     app.kubernetes.io/managed-by: kustomize
-  name: gpunode-viewer-role
+  name: gpu-viewer-role
 rules:
 - apiGroups:
   - tensor-fusion.ai
   resources:
-  - gpunodes
+  - gpus
   verbs:
   - get
   - list
@@ -18,6 +18,6 @@ rules:
 - apiGroups:
   - tensor-fusion.ai
   resources:
-  - gpunodes/status
+  - gpus/status
   verbs:
   - get
diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml
index 0bb7cfe..c737ba7 100644
--- a/config/rbac/kustomization.yaml
+++ b/config/rbac/kustomization.yaml
@@ -22,8 +22,8 @@ resources:
 # default, aiding admins in cluster management. Those roles are
 # not used by the Project itself. You can comment the following lines
 # if you do not want those helpers be installed with your Project.
-- gpunode_editor_role.yaml
-- gpunode_viewer_role.yaml
+- gpu_editor_role.yaml
+- gpu_viewer_role.yaml
 - tensorfusionconnection_editor_role.yaml
 - tensorfusionconnection_viewer_role.yaml
 
diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml
index a2a838e..95a7559 100644
--- a/config/rbac/role.yaml
+++ b/config/rbac/role.yaml
@@ -7,7 +7,7 @@ rules:
 - apiGroups:
   - tensor-fusion.ai
   resources:
-  - gpunodes
+  - gpus
   - tensorfusionconnections
   verbs:
   - create
@@ -20,14 +20,14 @@ rules:
 - apiGroups:
   - tensor-fusion.ai
   resources:
-  - gpunodes/finalizers
+  - gpus/finalizers
   - tensorfusionconnections/finalizers
   verbs:
   - update
 - apiGroups:
   - tensor-fusion.ai
   resources:
-  - gpunodes/status
+  - gpus/status
   - tensorfusionconnections/status
   verbs:
   - get
diff --git a/config/samples/kustomization.yaml b/config/samples/kustomization.yaml
index 387e950..7b7b116 100644
--- a/config/samples/kustomization.yaml
+++ b/config/samples/kustomization.yaml
@@ -1,5 +1,5 @@
 ## Append samples of your project ##
 resources:
 - v1_tensorfusionconnection.yaml
-- v1_gpunode.yaml
+- v1_gpu.yaml
 # +kubebuilder:scaffold:manifestskustomizesamples
diff --git a/config/samples/v1_gpunode.yaml b/config/samples/v1_gpu.yaml
similarity index 87%
rename from config/samples/v1_gpunode.yaml
rename to config/samples/v1_gpu.yaml
index 84a50cb..984bfb5 100644
--- a/config/samples/v1_gpunode.yaml
+++ b/config/samples/v1_gpu.yaml
@@ -1,10 +1,10 @@
 apiVersion: tensor-fusion.ai/v1
-kind: GPUNode
+kind: GPU
 metadata:
   labels:
     app.kubernetes.io/name: tensor-fusion-operator
     app.kubernetes.io/managed-by: kustomize
-  name: gpunode-sample
+  name: gpu-sample
 status:
   capacity:
     tflops: '200'
diff --git a/config/samples/v1_tensorfusionconnection.yaml b/config/samples/v1_tensorfusionconnection.yaml
index cc634ff..bf1072f 100644
--- a/config/samples/v1_tensorfusionconnection.yaml
+++ b/config/samples/v1_tensorfusionconnection.yaml
@@ -8,9 +8,9 @@ metadata:
   namespace: tensor-fusion
 spec:
   resources:
-    limit:
+    limits:
       tflops: '100'
       vram: 8Gi
-    request:
+    requests:
       tflops: '20'
       vram: 9Gi
diff --git a/internal/controller/gpunode_controller.go b/internal/controller/gpu_controller.go
similarity index 67%
rename from internal/controller/gpunode_controller.go
rename to internal/controller/gpu_controller.go
index caea975..e6533c1 100644
--- a/internal/controller/gpunode_controller.go
+++ b/internal/controller/gpu_controller.go
@@ -29,41 +29,41 @@ import (
 	scheduler "github.com/NexusGPU/tensor-fusion-operator/internal/scheduler"
 )
 
-// GPUNodeReconciler reconciles a GPUNode object
-type GPUNodeReconciler struct {
+// GPUReconciler reconciles a GPU object
+type GPUReconciler struct {
 	client.Client
 	Scheme    *runtime.Scheme
 	Scheduler scheduler.Scheduler
 }
 
-// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes,verbs=get;list;watch;create;update;patch;delete
-// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes/status,verbs=get;update;patch
-// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes/finalizers,verbs=update
+// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpus,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpus/status,verbs=get;update;patch
+// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpus/finalizers,verbs=update
 
 // Reconcile is part of the main kubernetes reconciliation loop which aims to
 // move the current state of the cluster closer to the desired state.
-func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+func (r *GPUReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
 	// TODO: Calculate tflops and update capacity here
 	return ctrl.Result{}, nil
 }
 
 // SetupWithManager sets up the controller with the Manager.
-func (r *GPUNodeReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
+func (r *GPUReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
 	return ctrl.NewControllerManagedBy(mgr).
-		For(&tfv1.GPUNode{}).
-		Named("gpunode").
+		For(&tfv1.GPU{}).
+		Named("gpu").
 		WithEventFilter(
 			predicate.Funcs{
 				CreateFunc: func(e event.CreateEvent) bool {
-					r.Scheduler.OnAdd(e.Object.(*tfv1.GPUNode))
+					r.Scheduler.OnAdd(e.Object.(*tfv1.GPU))
 					return true
 				},
 				UpdateFunc: func(e event.UpdateEvent) bool {
-					r.Scheduler.OnUpdate(e.ObjectOld.(*tfv1.GPUNode), e.ObjectNew.(*tfv1.GPUNode))
+					r.Scheduler.OnUpdate(e.ObjectOld.(*tfv1.GPU), e.ObjectNew.(*tfv1.GPU))
 					return true
 				},
 				DeleteFunc: func(e event.DeleteEvent) bool {
-					r.Scheduler.OnDelete(e.Object.(*tfv1.GPUNode))
+					r.Scheduler.OnDelete(e.Object.(*tfv1.GPU))
 					return true
 				},
 			},
diff --git a/internal/controller/gpunode_controller_test.go b/internal/controller/gpu_controller_test.go
similarity index 84%
rename from internal/controller/gpunode_controller_test.go
rename to internal/controller/gpu_controller_test.go
index 8cf0c89..3742307 100644
--- a/internal/controller/gpunode_controller_test.go
+++ b/internal/controller/gpu_controller_test.go
@@ -22,15 +22,14 @@ import (
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 	"k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"sigs.k8s.io/controller-runtime/pkg/reconcile"
 
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
 	tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
 )
 
-var _ = Describe("GPUNode Controller", func() {
+var _ = Describe("GPU Controller", func() {
 	Context("When reconciling a resource", func() {
 		const resourceName = "test-resource"
 
@@ -40,13 +39,13 @@ var _ = Describe("GPUNode Controller", func() {
 			Name:      resourceName,
 			Namespace: "default", // TODO(user):Modify as needed
 		}
-		gpunode := &tensorfusionaiv1.GPUNode{}
+		gpu := &tensorfusionaiv1.GPU{}
 
 		BeforeEach(func() {
-			By("creating the custom resource for the Kind GPUNode")
-			err := k8sClient.Get(ctx, typeNamespacedName, gpunode)
+			By("creating the custom resource for the Kind GPU")
+			err := k8sClient.Get(ctx, typeNamespacedName, gpu)
 			if err != nil && errors.IsNotFound(err) {
-				resource := &tensorfusionaiv1.GPUNode{
+				resource := &tensorfusionaiv1.GPU{
 					ObjectMeta: metav1.ObjectMeta{
 						Name:      resourceName,
 						Namespace: "default",
@@ -59,16 +58,16 @@ var _ = Describe("GPUNode Controller", func() {
 
 		AfterEach(func() {
 			// TODO(user): Cleanup logic after each test, like removing the resource instance.
-			resource := &tensorfusionaiv1.GPUNode{}
+			resource := &tensorfusionaiv1.GPU{}
 			err := k8sClient.Get(ctx, typeNamespacedName, resource)
 			Expect(err).NotTo(HaveOccurred())
 
-			By("Cleanup the specific resource instance GPUNode")
+			By("Cleanup the specific resource instance GPU")
 			Expect(k8sClient.Delete(ctx, resource)).To(Succeed())
 		})
 		It("should successfully reconcile the resource", func() {
 			By("Reconciling the created resource")
-			controllerReconciler := &GPUNodeReconciler{
+			controllerReconciler := &GPUReconciler{
 				Client: k8sClient,
 				Scheme: k8sClient.Scheme(),
 			}
diff --git a/internal/controller/tensorfusionconnection_controller.go b/internal/controller/tensorfusionconnection_controller.go
index 65937b1..6280a86 100644
--- a/internal/controller/tensorfusionconnection_controller.go
+++ b/internal/controller/tensorfusionconnection_controller.go
@@ -92,26 +92,26 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct
 		return ctrl.Result{}, nil
 	}
 
-	var node *tfv1.GPUNode
+	var gpu *tfv1.GPU
 	// If status is not set or pending, try to schedule
 	if connection.Status.Phase == "" || connection.Status.Phase == tfv1.TensorFusionConnectionPending {
-		// Try to get an available node from scheduler
+		// Try to get an available gpu from scheduler
 		var err error
-		node, err = r.Scheduler.Schedule(connection.Spec.Resources.Requests)
+		gpu, err = r.Scheduler.Schedule(connection.Spec.Resources.Requests)
 		if err != nil {
 			log.Info(err.Error())
 			connection.Status.Phase = tfv1.TensorFusionConnectionPending
-		} else if node != nil {
+		} else if gpu != nil {
 			connection.Status.Phase = tfv1.TensorFusionConnectionRunning
-			connection.Status.ConnectionURL = worker.GenerateConnectionURL(node, connection)
-			// Store the node name for cleanup
-			connection.Status.Node = node.Name
+			connection.Status.ConnectionURL = worker.GenerateConnectionURL(gpu, connection)
+			// Store the gpu name for cleanup
+			connection.Status.GPU = gpu.Name
 		} else {
 			connection.Status.Phase = tfv1.TensorFusionConnectionPending
 		}
 	}
 
-	if err := r.MustUpdateStatus(ctx, connection, node); err != nil {
+	if err := r.MustUpdateStatus(ctx, connection, gpu); err != nil {
 		return ctrl.Result{}, err
 	}
 
@@ -123,26 +123,26 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct
 
 // handleDeletion handles cleanup of external dependencies
 func (r *TensorFusionConnectionReconciler) handleDeletion(ctx context.Context, connection *tfv1.TensorFusionConnection) error {
-	if connection.Status.Node == "" {
-		return nil // No node was allocated, nothing to clean up
+	if connection.Status.GPU == "" {
+		return nil // No gpu was allocated, nothing to clean up
 	}
 
-	// Get the node
-	node := &tfv1.GPUNode{}
-	if err := r.Get(ctx, client.ObjectKey{Name: connection.Status.Node}, node); err != nil {
+	// Get the gpu
+	gpu := &tfv1.GPU{}
+	if err := r.Get(ctx, client.ObjectKey{Name: connection.Status.GPU}, gpu); err != nil {
 		if errors.IsNotFound(err) {
-			// Node is already gone, nothing to do
+			// gpu is already gone, nothing to do
 			return nil
 		}
 		return err
 	}
 
 	// Release the resources
-	if err := r.Scheduler.Release(connection.Spec.Resources.Requests, node); err != nil {
+	if err := r.Scheduler.Release(connection.Spec.Resources.Requests, gpu); err != nil {
 		return err
 	}
 
-	return r.MustUpdateStatus(ctx, connection, node)
+	return r.MustUpdateStatus(ctx, connection, gpu)
 }
 
 // Helper functions to handle finalizers
@@ -165,7 +165,7 @@ func removeString(slice []string, s string) []string {
 	return result
 }
 
-func (r *TensorFusionConnectionReconciler) MustUpdateStatus(ctx context.Context, connection *tfv1.TensorFusionConnection, gpuNode *tfv1.GPUNode) error {
+func (r *TensorFusionConnectionReconciler) MustUpdateStatus(ctx context.Context, connection *tfv1.TensorFusionConnection, gpu *tfv1.GPU) error {
 	return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
 		// Get the latest version of the connection
 		latestConnection := &tfv1.TensorFusionConnection{}
@@ -184,20 +184,20 @@ func (r *TensorFusionConnectionReconciler) MustUpdateStatus(ctx context.Context,
 			return err
 		}
 
-		if gpuNode != nil {
-			// Get the latest version of the node
-			latestNode := &tfv1.GPUNode{}
+		if gpu != nil {
+			// Get the latest version of the gpu
+			latestgpu := &tfv1.GPU{}
 
 			if err := r.Get(ctx, client.ObjectKey{
-				Name:      gpuNode.Name,
-				Namespace: gpuNode.Namespace,
-			}, latestNode); err != nil {
+				Name:      gpu.Name,
+				Namespace: gpu.Namespace,
+			}, latestgpu); err != nil {
 				return err
 			}
 
 			// Update the status fields we care about
-			latestNode.Status.Available = gpuNode.Status.Available
-			if err := r.Status().Update(ctx, latestNode); err != nil {
+			latestgpu.Status.Available = gpu.Status.Available
+			if err := r.Status().Update(ctx, latestgpu); err != nil {
 				return err
 			}
 		}
diff --git a/internal/scheduler/naive.go b/internal/scheduler/naive.go
index 2af423e..7b2627e 100644
--- a/internal/scheduler/naive.go
+++ b/internal/scheduler/naive.go
@@ -9,70 +9,70 @@ import (
 
 // NaiveScheduler implements a simple scheduling strategy
 type NaiveScheduler struct {
-	sync.Mutex
-	nodes map[string]*tfv1.GPUNode
+	sync.RWMutex
+	gpus map[string]*tfv1.GPU
 }
 
 // NewNaiveScheduler creates a new NaiveScheduler
 func NewNaiveScheduler() *NaiveScheduler {
 	return &NaiveScheduler{
-		nodes: make(map[string]*tfv1.GPUNode),
+		gpus: make(map[string]*tfv1.GPU),
 	}
 }
 
 // Schedule implements Scheduler interface
-func (s *NaiveScheduler) Schedule(request tfv1.Resource) (*tfv1.GPUNode, error) {
+func (s *NaiveScheduler) Schedule(request tfv1.Resource) (*tfv1.GPU, error) {
 	s.Lock()
 	defer s.Unlock()
 
-	// Simple strategy: return the first node that has enough resources
-	for _, node := range s.nodes {
-		if node.Status.Available.Tflops.Cmp(request.Tflops) >= 0 &&
-			node.Status.Available.Vram.Cmp(request.Vram) >= 0 {
-			// Update the node's available resources
-			node.Status.Available.Tflops.Sub(request.Tflops)
-			node.Status.Available.Vram.Sub(request.Vram)
-			return node, nil
+	// Simple strategy: return the first gpu that has enough resources
+	for _, gpu := range s.gpus {
+		if gpu.Status.Available.Tflops.Cmp(request.Tflops) >= 0 &&
+			gpu.Status.Available.Vram.Cmp(request.Vram) >= 0 {
+			// Update the gpu's available resources
+			gpu.Status.Available.Tflops.Sub(request.Tflops)
+			gpu.Status.Available.Vram.Sub(request.Vram)
+			return gpu, nil
 		}
 	}
-	return nil, fmt.Errorf("no suitable node found for request: %v", request)
+	return nil, fmt.Errorf("no suitable gpu found for request: %v", request)
 }
 
 // OnAdd implements Scheduler interface
-func (s *NaiveScheduler) OnAdd(node *tfv1.GPUNode) {
+func (s *NaiveScheduler) OnAdd(gpu *tfv1.GPU) {
 	s.Lock()
 	defer s.Unlock()
-	s.nodes[node.Name] = node
+	s.gpus[gpu.Name] = gpu
 }
 
 // OnUpdate implements Scheduler interface
-func (s *NaiveScheduler) OnUpdate(oldNode, newNode *tfv1.GPUNode) {
+func (s *NaiveScheduler) OnUpdate(oldgpu, newgpu *tfv1.GPU) {
 	s.Lock()
 	defer s.Unlock()
-	s.nodes[newNode.Name] = newNode
+	s.gpus[newgpu.Name] = newgpu
 }
 
 // OnDelete implements Scheduler interface
-func (s *NaiveScheduler) OnDelete(node *tfv1.GPUNode) {
+func (s *NaiveScheduler) OnDelete(gpu *tfv1.GPU) {
 	s.Lock()
 	defer s.Unlock()
-	delete(s.nodes, node.Name)
+	delete(s.gpus, gpu.Name)
 }
 
 // Release implements Scheduler interface
-func (s *NaiveScheduler) Release(request tfv1.Resource, node *tfv1.GPUNode) error {
+func (s *NaiveScheduler) Release(request tfv1.Resource, gpu *tfv1.GPU) error {
 	s.Lock()
 	defer s.Unlock()
 
-	existingNode, ok := s.nodes[node.Name]
+	existinggpu, ok := s.gpus[gpu.Name]
 	if !ok {
-		return fmt.Errorf("node %s not found", node.Name)
+		return fmt.Errorf("gpu %s not found", gpu.Name)
 	}
 
 	// Add back the released resources
-	existingNode.Status.Available.Tflops.Add(request.Tflops)
-	existingNode.Status.Available.Vram.Add(request.Vram)
-	// output the updated node
-	node.Status.Available = existingNode.Status.Available
+	existinggpu.Status.Available.Tflops.Add(request.Tflops)
+	existinggpu.Status.Available.Vram.Add(request.Vram)
+	// output the updated gpu
+	gpu.Status.Available = existinggpu.Status.Available
 	return nil
 }
diff --git a/internal/scheduler/naive_test.go b/internal/scheduler/naive_test.go
index 3cdfea4..aac3b17 100644
--- a/internal/scheduler/naive_test.go
+++ b/internal/scheduler/naive_test.go
@@ -8,12 +8,12 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
-func createGPUNode(name string, tflops, vram string) *tfv1.GPUNode {
-	return &tfv1.GPUNode{
+func createGPU(name string, tflops, vram string) *tfv1.GPU {
+	return &tfv1.GPU{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: name,
 		},
-		Status: tfv1.GPUNodeStatus{
+		Status: tfv1.GPUStatus{
 			Available: tfv1.Resource{
 				Tflops: resource.MustParse(tflops),
 				Vram:   resource.MustParse(vram),
@@ -33,59 +33,59 @@ func createRequest(tflops, vram string) tfv1.Resource {
 func TestNaiveScheduler_Schedule(t *testing.T) {
 	tests := []struct {
 		name                string
-		nodes               []*tfv1.GPUNode
+		gpus                []*tfv1.GPU
 		request             tfv1.Resource
-		wantNode            string
+		wantgpu             string
 		wantError           bool
 		wantRemainingTflops string
 		wantRemainingVram   string
 	}{
 		{
 			name: "simple match",
-			nodes: []*tfv1.GPUNode{
-				createGPUNode("node1", "100", "16Gi"),
+			gpus: []*tfv1.GPU{
+				createGPU("gpu1", "100", "16Gi"),
 			},
 			request:             createRequest("50", "8Gi"),
-			wantNode:            "node1",
+			wantgpu:             "gpu1",
 			wantError:           false,
 			wantRemainingTflops: "50",
 			wantRemainingVram:   "8Gi",
 		},
 		{
-			name:      "no nodes",
-			nodes:     []*tfv1.GPUNode{},
+			name:      "no gpus",
+			gpus:      []*tfv1.GPU{},
 			request:   createRequest("50", "8Gi"),
-			wantNode:  "",
+			wantgpu:   "",
 			wantError: true,
 		},
 		{
 			name: "insufficient resources",
-			nodes: []*tfv1.GPUNode{
-				createGPUNode("node1", "40", "16Gi"),
+			gpus: []*tfv1.GPU{
+				createGPU("gpu1", "40", "16Gi"),
 			},
 			request:   createRequest("50", "8Gi"),
-			wantNode:  "",
+			wantgpu:   "",
 			wantError: true,
 		},
 		{
-			name: "multiple nodes, first fit",
-			nodes: []*tfv1.GPUNode{
-				createGPUNode("node1", "40", "16Gi"),
-				createGPUNode("node2", "100", "32Gi"),
+			name: "multiple gpus, first fit",
+			gpus: []*tfv1.GPU{
+				createGPU("gpu1", "40", "16Gi"),
+				createGPU("gpu2", "100", "32Gi"),
 			},
 			request:             createRequest("50", "8Gi"),
-			wantNode:            "node2",
+			wantgpu:             "gpu2",
 			wantError:           false,
 			wantRemainingTflops: "50",
 			wantRemainingVram:   "24Gi",
 		},
 		{
 			name: "exact match",
-			nodes: []*tfv1.GPUNode{
-				createGPUNode("node1", "50", "8Gi"),
+			gpus: []*tfv1.GPU{
+				createGPU("gpu1", "50", "8Gi"),
 			},
 			request:             createRequest("50", "8Gi"),
-			wantNode:            "node1",
+			wantgpu:             "gpu1",
 			wantError:           false,
 			wantRemainingTflops: "0",
 			wantRemainingVram:   "0",
@@ -96,9 +96,9 @@ func TestNaiveScheduler_Schedule(t *testing.T) {
 		t.Run(tt.name, func(t *testing.T) {
 			s := NewNaiveScheduler()
 
-			// Add nodes
-			for _, node := range tt.nodes {
-				s.OnAdd(node)
+			// Add gpus
+			for _, gpu := range tt.gpus {
+				s.OnAdd(gpu)
 			}
 
 			// Try to schedule
@@ -113,11 +113,11 @@ func TestNaiveScheduler_Schedule(t *testing.T) {
 			// Check result
 			if !tt.wantError {
 				if got == nil {
-					t.Error("Schedule() returned nil node when error not expected")
+					t.Error("Schedule() returned nil gpu when error not expected")
 					return
 				}
-				if got.Name != tt.wantNode {
-					t.Errorf("Schedule() got node = %v, want %v", got.Name, tt.wantNode)
+				if got.Name != tt.wantgpu {
+					t.Errorf("Schedule() got gpu = %v, want %v", got.Name, tt.wantgpu)
 				}
 
 				// Check remaining resources
@@ -138,57 +138,57 @@ func TestNaiveScheduler_Schedule(t *testing.T) {
 	}
 }
 
-func TestNaiveScheduler_NodeOperations(t *testing.T) {
+func TestNaiveScheduler_gpuOperations(t *testing.T) {
 	s := NewNaiveScheduler()
-	node1 := createGPUNode("node1", "100", "16Gi")
+	gpu1 := createGPU("gpu1", "100", "16Gi")
 	request := createRequest("50", "8Gi")
 
 	// Test OnAdd
-	s.OnAdd(node1)
+	s.OnAdd(gpu1)
 	got, err := s.Schedule(request)
-	if err != nil || got.Name != "node1" {
-		t.Errorf("After OnAdd: Schedule() got = %v, want node1", got)
+	if err != nil || got.Name != "gpu1" {
+		t.Errorf("After OnAdd: Schedule() got = %v, want gpu1", got)
 	}
 
 	// Test OnUpdate
-	node1Updated := createGPUNode("node1", "40", "16Gi")
-	s.OnUpdate(node1, node1Updated)
+	gpu1Updated := createGPU("gpu1", "40", "16Gi")
+	s.OnUpdate(gpu1, gpu1Updated)
 	_, err = s.Schedule(request)
 	if err == nil {
 		t.Error("After OnUpdate: Schedule() should fail with insufficient resources")
 	}
 
 	// Test OnDelete
-	s.OnDelete(node1Updated)
+	s.OnDelete(gpu1Updated)
 	_, err = s.Schedule(request)
 	if err == nil {
-		t.Error("After OnDelete: Schedule() should fail with no nodes")
+		t.Error("After OnDelete: Schedule() should fail with no gpus")
 	}
 }
 
 func TestNaiveScheduler_Release(t *testing.T) {
 	tests := []struct {
 		name                string
-		node               *tfv1.GPUNode
-		schedule           *tfv1.Resource
-		release            *tfv1.Resource
-		wantError          bool
+		gpu                 *tfv1.GPU
+		schedule            *tfv1.Resource
+		release             *tfv1.Resource
+		wantError           bool
 		wantRemainingTflops string
 		wantRemainingVram   string
 	}{
 		{
-			name:      "release non-existent node",
-			node:      createGPUNode("node1", "100", "16Gi"),
+			name:      "release non-existent gpu",
+			gpu:       createGPU("gpu1", "100", "16Gi"),
 			release:   &tfv1.Resource{},
 			wantError: true,
 		},
 		{
 			name: "release after scheduling",
-			node: &tfv1.GPUNode{
+			gpu: &tfv1.GPU{
 				ObjectMeta: metav1.ObjectMeta{
-					Name: "node1",
+					Name: "gpu1",
 				},
-				Status: tfv1.GPUNodeStatus{
+				Status: tfv1.GPUStatus{
 					Capacity: tfv1.Resource{
 						Tflops: resource.MustParse("100"),
 						Vram:   resource.MustParse("16Gi"),
@@ -207,17 +207,17 @@ func TestNaiveScheduler_Release(t *testing.T) {
 				Tflops: resource.MustParse("50"),
 				Vram:   resource.MustParse("8Gi"),
 			},
-			wantError:          false,
+			wantError:           false,
 			wantRemainingTflops: "100",
 			wantRemainingVram:   "16Gi",
 		},
 		{
 			name: "partial release",
-			node: &tfv1.GPUNode{
+			gpu: &tfv1.GPU{
 				ObjectMeta: metav1.ObjectMeta{
-					Name: "node1",
+					Name: "gpu1",
 				},
-				Status: tfv1.GPUNodeStatus{
+				Status: tfv1.GPUStatus{
 					Capacity: tfv1.Resource{
 						Tflops: resource.MustParse("100"),
 						Vram:   resource.MustParse("16Gi"),
@@ -236,7 +236,7 @@ func TestNaiveScheduler_Release(t *testing.T) {
 				Tflops: resource.MustParse("30"),
 				Vram:   resource.MustParse("5Gi"),
 			},
-			wantError:          false,
+			wantError:           false,
 			wantRemainingTflops: "70",
 			wantRemainingVram:   "11Gi",
 		},
@@ -247,31 +247,31 @@ func TestNaiveScheduler_Release(t *testing.T) {
 			s := NewNaiveScheduler()
 
 			if !tt.wantError {
-				// Add the node first
-				s.OnAdd(tt.node)
+				// Add the gpu first
+				s.OnAdd(tt.gpu)
 
 				// Schedule some resources if needed
 				if tt.schedule != nil {
-					node, err := s.Schedule(*tt.schedule)
+					gpu, err := s.Schedule(*tt.schedule)
 					if err != nil {
 						t.Errorf("Schedule() error = %v", err)
 						return
 					}
 
 					// Verify resources were allocated
-					expectedTflops := tt.node.Status.Capacity.Tflops.DeepCopy()
-					expectedVram := tt.node.Status.Capacity.Vram.DeepCopy()
+					expectedTflops := tt.gpu.Status.Capacity.Tflops.DeepCopy()
+					expectedVram := tt.gpu.Status.Capacity.Vram.DeepCopy()
 					expectedTflops.Sub(tt.schedule.Tflops)
 					expectedVram.Sub(tt.schedule.Vram)
-					if node.Status.Available.Tflops.Cmp(expectedTflops) != 0 ||
-						node.Status.Available.Vram.Cmp(expectedVram) != 0 {
+					if gpu.Status.Available.Tflops.Cmp(expectedTflops) != 0 ||
+						gpu.Status.Available.Vram.Cmp(expectedVram) != 0 {
 						t.Errorf("Schedule() did not allocate resources correctly")
 						return
 					}
 				}
 			}
 
-			err := s.Release(*tt.release, tt.node)
+			err := s.Release(*tt.release, tt.gpu)
 			if (err != nil) != tt.wantError {
 				t.Errorf("Release() error = %v, wantError %v", err, tt.wantError)
 				return
@@ -279,12 +279,12 @@ func TestNaiveScheduler_Release(t *testing.T) {
 
 			if !tt.wantError {
 				// Verify resources were restored correctly
-				node := s.nodes[tt.node.Name]
-				if node.Status.Available.Tflops.String() != tt.wantRemainingTflops ||
-					node.Status.Available.Vram.String() != tt.wantRemainingVram {
+				gpu := s.gpus[tt.gpu.Name]
+				if gpu.Status.Available.Tflops.String() != tt.wantRemainingTflops ||
+					gpu.Status.Available.Vram.String() != tt.wantRemainingVram {
 					t.Errorf("Release() resources incorrect, got tflops=%v vram=%v, want tflops=%v vram=%v",
-						node.Status.Available.Tflops.String(),
-						node.Status.Available.Vram.String(),
+						gpu.Status.Available.Tflops.String(),
+						gpu.Status.Available.Vram.String(),
 						tt.wantRemainingTflops,
 						tt.wantRemainingVram)
 				}
diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go
index bd18893..a17ed2b 100644
--- a/internal/scheduler/scheduler.go
+++ b/internal/scheduler/scheduler.go
@@ -4,20 +4,18 @@ import (
 	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
 )
 
-// Scheduler is the interface that wraps the scheduling methods
+// Scheduler is the interface that wraps the basic scheduling methods.
 type Scheduler interface {
-	// Schedule takes a Resource Request and returns the pointer of the GPU node
-	// that can accommodate the request. If no suitable node is found, it returns
-	// an nil pointer and an error.
-	Schedule(request tfv1.Resource) (*tfv1.GPUNode, error)
+	// Schedule returns a gpu that can accommodate the request, or nil and an error if none is found.
+	Schedule(request tfv1.Resource) (*tfv1.GPU, error)
 
-	// Release frees the allocated resources of a node
-	Release(request tfv1.Resource, node *tfv1.GPUNode) error
+	// Release releases a request from a gpu.
+	Release(request tfv1.Resource, gpu *tfv1.GPU) error
 
-	// OnAdd is called when a new node is added
-	OnAdd(node *tfv1.GPUNode)
-	// OnUpdate is called when a node is modified
-	OnUpdate(oldNode, newNode *tfv1.GPUNode)
-	// OnDelete is called when a node is deleted
-	OnDelete(node *tfv1.GPUNode)
+	// OnAdd is called when a gpu is added.
+	OnAdd(gpu *tfv1.GPU)
+	// OnUpdate is called when a gpu is updated.
+	OnUpdate(oldGPU, newGPU *tfv1.GPU)
+	// OnDelete is called when a gpu is deleted.
+	OnDelete(gpu *tfv1.GPU)
 }
diff --git a/internal/worker/worker.go b/internal/worker/worker.go
index 74b93d3..d3509c5 100644
--- a/internal/worker/worker.go
+++ b/internal/worker/worker.go
@@ -4,6 +4,6 @@ import (
 	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
 )
 
-func GenerateConnectionURL(_node *tfv1.GPUNode, _connection *tfv1.TensorFusionConnection) string {
+func GenerateConnectionURL(_gpu *tfv1.GPU, _connection *tfv1.TensorFusionConnection) string {
 	return "TODO://"
 }

From e9de90245af54e95504fab7f9160d90dad6a54c7 Mon Sep 17 00:00:00 2001
From: 0x5457 <0x5457@protonmail.com>
Date: Mon, 9 Dec 2024 08:20:44 +0000
Subject: [PATCH 20/22] feat: add release.yml

---
 .github/workflows/release.yml | 57 +++++++++++++++++++++++++++++++++++
 Makefile                      |  2 +-
 2 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/release.yml

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..4194e51
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,57 @@
+name: Release
+
+on:
+  push:
+    branches:
+      - main
+  workflow_dispatch:
+
+jobs:
+  release:
+    runs-on: ubuntu-20.04
+    outputs:
+      published: ${{ steps.semantic.outputs.new_release_published }}
+      version: ${{ steps.semantic.outputs.new_release_version }}
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Semantic Release
+        id: semantic
+        uses: cycjimmy/semantic-release-action@v3.4.1
+        with:
+          semantic_version: 18
+          extra_plugins: |
+            @semantic-release/release-notes-generator@^10
+            @semantic-release/github@^8
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+  publish_image:
+    needs:
+      - release
+    if: needs.release.outputs.published == 'true'
+    runs-on: ubuntu-latest
+    outputs:
+      image_digest: ${{ steps.build.outputs.digest }}
+    steps:
+      - uses: actions/checkout@v3
+      - id: meta
+        uses: docker/metadata-action@v4
+        with:
+          images: |
+            tensorfusion/tensor-fusion-operator
+          tags: |
+            type=semver,pattern={{version}},value=${{ needs.release.outputs.version }}
+      - name: Login to DockerHub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+      - name: Build and push
+        id: build
+        uses: docker/build-push-action@v3
+        with:
+          context: .
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          no-cache: true
diff --git a/Makefile b/Makefile
index a95fe02..73bfa20 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 # Image URL to use all building/pushing image targets
-IMG ?= controller:latest
+IMG ?= tensorfusion/tensor-fusion-operator:latest
 # ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary.
 ENVTEST_K8S_VERSION = 1.31.0
 

From 350525a8d8febc7ec4b888bc5e40c2b259bebe54 Mon Sep 17 00:00:00 2001
From: 0x5457 <0x5457@protonmail.com>
Date: Mon, 9 Dec 2024 08:39:39 +0000
Subject: [PATCH 21/22] feat(webhook): fix tensor fusion pod mutation handler
 tests

---
 .github/workflows/test-e2e.yml                |  5 +++--
 ...sor-fusion.ai_tensorfusionconnections.yaml |  2 +-
 internal/webhook/v1/pod_webhook.go            | 21 ++++++++++++-------
 internal/webhook/v1/pod_webhook_test.go       |  9 ++++----
 4 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/test-e2e.yml b/.github/workflows/test-e2e.yml
index 8780644..70a12f2 100644
--- a/.github/workflows/test-e2e.yml
+++ b/.github/workflows/test-e2e.yml
@@ -1,8 +1,9 @@
 name: E2E Tests
 
 on:
-  push:
-  pull_request:
+  workflow_dispatch:
+  # push:
+  # pull_request:
 
 jobs:
   test-e2e:
diff --git a/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml b/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml
index 7b2b288..37d47be 100644
--- a/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml
+++ b/config/crd/bases/tensor-fusion.ai_tensorfusionconnections.yaml
@@ -91,7 +91,7 @@ spec:
             properties:
               connectionURL:
                 type: string
-              node:
+              gpu:
                 type: string
               phase:
                 type: string
diff --git a/internal/webhook/v1/pod_webhook.go b/internal/webhook/v1/pod_webhook.go
index 2b9bfcf..16cc35d 100644
--- a/internal/webhook/v1/pod_webhook.go
+++ b/internal/webhook/v1/pod_webhook.go
@@ -67,6 +67,10 @@ func (m *TensorFusionPodMutator) Handle(ctx context.Context, req admission.Reque
 	log.Info("Mutating pod", "name", pod.Name, "namespace", pod.Namespace)
 
 	reqs := parseTFReq(pod)
+	if len(reqs) == 0 {
+		return admission.Allowed("no tensor fusion requirements found")
+	}
+
 	// 1. Inject initContainer and env variables
 	patches, err := m.patchTFClient(pod, reqs)
 	if err != nil {
@@ -108,9 +112,14 @@ func parseTFReq(pod *corev1.Pod) []TFReq {
 	for _, container := range pod.Spec.Containers {
 		containerName := container.Name
 
-		// Check if tensor fusion is enabled for this container
-		enableKey := fmt.Sprintf(constants.EnableContainerAnnotationFormat, containerName)
-		if enableStr, ok := pod.Annotations[enableKey]; !ok || enableStr != "true" {
+		// Check if TF requirements exist for this container
+		tflopsKey := fmt.Sprintf(constants.TFLOPSContainerAnnotationFormat, containerName)
+		vramKey := fmt.Sprintf(constants.VRAMContainerAnnotationFormat, containerName)
+
+		tflopsStr, hasTflops := pod.Annotations[tflopsKey]
+		vramStr, hasVram := pod.Annotations[vramKey]
+
+		if !hasTflops && !hasVram {
 			continue
 		}
 
@@ -119,8 +128,7 @@ func parseTFReq(pod *corev1.Pod) []TFReq {
 		}
 
 		// Parse TFLOPS requirement
-		tflopsKey := fmt.Sprintf(constants.TFLOPSContainerAnnotationFormat, containerName)
-		if tflopsStr, ok := pod.Annotations[tflopsKey]; ok {
+		if hasTflops {
 			tflops, err := resource.ParseQuantity(tflopsStr)
 			if err == nil {
 				req.Tflops = tflops
@@ -128,8 +136,7 @@ func parseTFReq(pod *corev1.Pod) []TFReq {
 		}
 
 		// Parse VRAM requirement
-		vramKey := fmt.Sprintf(constants.VRAMContainerAnnotationFormat, containerName)
-		if vramStr, ok := pod.Annotations[vramKey]; ok {
+		if hasVram {
 			vram, err := resource.ParseQuantity(vramStr)
 			if err == nil {
 				req.Vram = vram
diff --git a/internal/webhook/v1/pod_webhook_test.go b/internal/webhook/v1/pod_webhook_test.go
index 87eaa79..73b401d 100644
--- a/internal/webhook/v1/pod_webhook_test.go
+++ b/internal/webhook/v1/pod_webhook_test.go
@@ -23,6 +23,7 @@ import (
 
 	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
 	"github.com/NexusGPU/tensor-fusion-operator/internal/config"
+	"github.com/NexusGPU/tensor-fusion-operator/internal/constants"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 	admissionv1 "k8s.io/api/admission/v1"
@@ -67,8 +68,8 @@ var _ = Describe("TensorFusionPodMutator", func() {
 					Name:      "test-pod",
 					Namespace: "default",
 					Annotations: map[string]string{
-						"tf.nexusgpu.com/tflops": "100",
-						"tf.nexusgpu.com/vram":   "16Gi",
+						constants.TensorFusionDomain + "/tflops-main": "100",
+						constants.TensorFusionDomain + "/vram-main":   "16Gi",
 					},
 				},
 				Spec: corev1.PodSpec{
@@ -158,8 +159,8 @@ var _ = Describe("TensorFusionPodMutator", func() {
 			pod := &corev1.Pod{
 				ObjectMeta: metav1.ObjectMeta{
 					Annotations: map[string]string{
-						"tf.nexusgpu.com/tflops": "100",
-						"tf.nexusgpu.com/vram":   "16Gi",
+						constants.TensorFusionDomain + "/tflops-test-container": "100",
+						constants.TensorFusionDomain + "/vram-test-container":   "16Gi",
 					},
 				},
 				Spec: corev1.PodSpec{

From 4213e1e2077653a2ee563e152f00a40d39fb1255 Mon Sep 17 00:00:00 2001
From: 0x5457 <0x5457@protonmail.com>
Date: Mon, 9 Dec 2024 09:18:15 +0000
Subject: [PATCH 22/22] ci: update release workflow to use ubuntu-latest runner

---
 .github/workflows/release.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 4194e51..44d7181 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -8,7 +8,7 @@ on:
 
 jobs:
   release:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
     outputs:
       published: ${{ steps.semantic.outputs.new_release_published }}
       version: ${{ steps.semantic.outputs.new_release_version }}