Skip to content

🐛 Fix flaky Helm installations by separating provider CRs from operator deployment #832

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
225 changes: 225 additions & 0 deletions .github/workflows/smoke-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
---
name: Smoke Test

# End-to-end smoke test: build the operator image and charts, install them
# on a kind management cluster via the split operator/providers Helm charts,
# then provision a CAPD (docker) workload cluster and verify it comes up.
on:
  pull_request:
    branches: [main, 'release-*']
  push:
    branches: [main]
  workflow_dispatch:

permissions:
  contents: read

env:
  # Name of the CAPD-provisioned workload cluster.
  CLUSTER_NAME: capi-quickstart
  # Name of the kind management cluster.
  KIND_CLUSTER_NAME: capi-operator-smoke-test
  KUBERNETES_VERSION: v1.33.0
  CONTROLLER_IMG: cluster-api-operator
  TAG: smoke-test

jobs:
  smoke-test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version-file: 'go.mod'

      - name: Install tools
        run: |
          # kubectl
          curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          chmod +x kubectl && sudo mv kubectl /usr/local/bin/

          # yq
          wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O yq
          chmod +x yq && sudo mv yq /usr/local/bin/

          # helm
          curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash

          # clusterctl
          curl -L https://github.com/kubernetes-sigs/cluster-api/releases/latest/download/clusterctl-linux-amd64 -o clusterctl
          chmod +x clusterctl && sudo mv clusterctl /usr/local/bin/

      - name: Build Docker image
        run: |
          make docker-build
          # docker-build produces an arch-suffixed tag; retag to the plain
          # name that the Helm values below reference.
          docker tag ${CONTROLLER_IMG}-amd64:${TAG} ${CONTROLLER_IMG}:${TAG}

      - name: Build charts
        run: |
          make release-chart
          # Extract the computed chart tag from make's variable database so
          # later steps can locate the packaged .tgz files.
          echo "HELM_CHART_TAG=$(make -s -f Makefile -p | grep '^HELM_CHART_TAG :=' | cut -d' ' -f3)" >> $GITHUB_ENV

      - name: Create kind cluster
        run: |
          chmod +x ./hack/ensure-kind.sh
          ./hack/ensure-kind.sh

          # Mount the host Docker socket so the CAPD provider running inside
          # the management cluster can create workload-cluster containers.
          cat <<EOF > /tmp/kind-config.yaml
          kind: Cluster
          apiVersion: kind.x-k8s.io/v1alpha4
          networking:
            ipFamily: ipv4
          nodes:
            - role: control-plane
              extraMounts:
                - hostPath: /var/run/docker.sock
                  containerPath: /var/run/docker.sock
          containerdConfigPatches:
            - |-
              [plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]
                endpoint = ["https://mirror.gcr.io", "https://registry-1.docker.io"]
          EOF

          kind create cluster --name ${KIND_CLUSTER_NAME} --config /tmp/kind-config.yaml --wait 5m
          kind load docker-image ${CONTROLLER_IMG}:${TAG} --name ${KIND_CLUSTER_NAME}

      - name: Install cert-manager
        run: |
          helm repo add jetstack https://charts.jetstack.io
          helm repo update
          helm install cert-manager jetstack/cert-manager \
            --namespace cert-manager \
            --create-namespace \
            --set installCRDs=true \
            --wait \
            --timeout 5m

      - name: Install Cluster API Operator
        run: |
          CHART_PACKAGE="out/package/cluster-api-operator-${HELM_CHART_TAG}.tgz"
          helm install capi-operator "$CHART_PACKAGE" \
            --create-namespace \
            -n capi-operator-system \
            --set image.manager.repository=${CONTROLLER_IMG} \
            --set image.manager.tag=${TAG} \
            --set image.manager.pullPolicy=IfNotPresent \
            --wait \
            --timeout 90s

      - name: Deploy providers
        run: |
          # Provider CRs are installed from the separate providers chart so
          # the operator deployment is already up before any CRs appear.
          cat <<EOF > /tmp/providers-values.yaml
          core:
            cluster-api:
              namespace: capi-system
          bootstrap:
            kubeadm:
              namespace: capi-kubeadm-bootstrap-system
          controlPlane:
            kubeadm:
              namespace: capi-kubeadm-control-plane-system
          infrastructure:
            docker:
              namespace: capd-system
          manager:
            featureGates:
              core:
                ClusterTopology: true
                ClusterResourceSet: true
                MachinePool: true
              kubeadm:
                ClusterTopology: true
                MachinePool: true
              docker:
                ClusterTopology: true
          EOF

          PROVIDERS_CHART_PACKAGE="out/package/cluster-api-operator-providers-${HELM_CHART_TAG}.tgz"
          helm install capi-providers "$PROVIDERS_CHART_PACKAGE" -f /tmp/providers-values.yaml --wait

      - name: Wait for providers
        run: |
          # Provider CRs report Ready once the operator has applied the
          # provider components; then wait for the controller deployments.
          kubectl wait --for=condition=Ready --timeout=300s -n capi-system coreprovider/cluster-api
          kubectl wait --for=condition=Ready --timeout=300s -n capi-kubeadm-bootstrap-system bootstrapprovider/kubeadm
          kubectl wait --for=condition=Ready --timeout=300s -n capi-kubeadm-control-plane-system controlplaneprovider/kubeadm
          kubectl wait --for=condition=Ready --timeout=300s -n capd-system infrastructureprovider/docker

          kubectl wait --for=condition=Available --timeout=300s -n capi-system deployment/capi-controller-manager
          kubectl wait --for=condition=Available --timeout=300s -n capi-kubeadm-bootstrap-system deployment/capi-kubeadm-bootstrap-controller-manager
          kubectl wait --for=condition=Available --timeout=300s -n capi-kubeadm-control-plane-system deployment/capi-kubeadm-control-plane-controller-manager
          kubectl wait --for=condition=Available --timeout=300s -n capd-system deployment/capd-controller-manager

      - name: Verify providers
        run: |
          kubectl get coreprovider,bootstrapprovider,controlplaneprovider,infrastructureprovider -A
          kubectl get pods -A | grep -E "(capi-|capd-)"

      - name: Create workload cluster
        run: |
          clusterctl generate cluster ${CLUSTER_NAME} \
            --infrastructure docker:v1.10.0 \
            --flavor development \
            --kubernetes-version ${KUBERNETES_VERSION} \
            --control-plane-machine-count=1 \
            --worker-machine-count=2 \
            > capi-quickstart.yaml

          kubectl apply -f capi-quickstart.yaml

      - name: Get workload cluster kubeconfig
        run: |
          timeout 300s bash -c "until kubectl get secret ${CLUSTER_NAME}-kubeconfig -n default &>/dev/null; do sleep 2; done"
          clusterctl get kubeconfig ${CLUSTER_NAME} --namespace default > ${CLUSTER_NAME}.kubeconfig
          # NOTE: exporting KUBECONFIG via GITHUB_ENV makes every later step
          # target the WORKLOAD cluster by default; steps that need the
          # management cluster must reset it explicitly.
          echo "KUBECONFIG=$(pwd)/${CLUSTER_NAME}.kubeconfig" >> $GITHUB_ENV

      - name: Wait for workload cluster API server
        run: |
          timeout 300s bash -c "until kubectl cluster-info &>/dev/null; do sleep 5; done"

      - name: Install CNI
        run: |
          kubectl apply -f https://raw.githubusercontent.com/projectcalico/calico/v3.26.1/manifests/calico.yaml
          # The manifest-based Calico install deploys into kube-system; the
          # tigera-operator / calico-system namespaces only exist with the
          # operator-based install, so wait on the kube-system components.
          kubectl wait --for=condition=Ready --timeout=300s pods -n kube-system -l k8s-app=calico-node || true
          kubectl wait --for=condition=Ready --timeout=300s pods -n kube-system -l k8s-app=calico-kube-controllers || true

      - name: Wait for nodes
        run: |
          kubectl wait --for=condition=Ready --timeout=300s nodes --all
          kubectl get nodes -o wide

      - name: Verify cluster
        run: |
          kubectl get po -A
          kubectl wait --for=condition=Ready --timeout=300s pods -n kube-system -l k8s-app=kube-proxy
          kubectl wait --for=condition=Ready --timeout=300s pods -n kube-system -l component=kube-apiserver
          kubectl wait --for=condition=Ready --timeout=300s pods -n kube-system -l component=kube-controller-manager
          kubectl wait --for=condition=Ready --timeout=300s pods -n kube-system -l component=kube-scheduler

      - name: Collect logs on failure
        if: failure()
        run: |
          # KUBECONFIG may still point at the workload cluster (exported via
          # GITHUB_ENV above); reset it so the management-cluster queries
          # below hit the kind cluster instead of the workload cluster.
          unset KUBECONFIG

          echo "=== Recent Events ==="
          kubectl get events -A --sort-by='.lastTimestamp' | tail -50

          echo -e "\n=== Provider Logs ==="
          kubectl logs -n capi-operator-system deployment/capi-operator-cluster-api-operator --tail=50 || true
          kubectl logs -n capi-system deployment/capi-controller-manager --tail=50 || true
          kubectl logs -n capd-system deployment/capd-controller-manager --tail=50 || true

          echo -e "\n=== Cluster Resources ==="
          kubectl get cluster,dockercluster,kubeadmcontrolplane,machine,dockermachine -A -o wide || true

          echo -e "\n=== Failed Pods ==="
          kubectl get pods -A | grep -v Running | grep -v Completed || true

          if [ -f "${CLUSTER_NAME}.kubeconfig" ]; then
            export KUBECONFIG="$(pwd)/${CLUSTER_NAME}.kubeconfig"
            echo -e "\n=== Workload Cluster Status ==="
            kubectl get nodes -o wide || true
            kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded || true
          fi

      - name: Cleanup
        if: always()
        run: |
          # Best-effort cleanup: remove the workload cluster containers
          # first, then the kind management cluster.
          kind delete cluster --name ${CLUSTER_NAME} || true
          kind delete cluster --name ${KIND_CLUSTER_NAME} || true
11 changes: 10 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ endif
RELEASE_ALIAS_TAG ?= $(PULL_BASE_REF)
RELEASE_DIR := $(ROOT)/out
CHART_DIR := $(RELEASE_DIR)/charts/cluster-api-operator
CHART_PROVIDERS_DIR := $(RELEASE_DIR)/charts/cluster-api-operator-providers
CHART_PACKAGE_DIR := $(RELEASE_DIR)/package

# Set --output-base for conversion-gen if we are not within GOPATH
Expand Down Expand Up @@ -455,6 +456,9 @@ $(CHART_DIR):
$(CHART_PACKAGE_DIR):
mkdir -p $(CHART_PACKAGE_DIR)

# Staging directory for the cluster-api-operator-providers chart; the
# templates/ subdirectory is created up front for the chart contents.
$(CHART_PROVIDERS_DIR):
mkdir -p $(CHART_PROVIDERS_DIR)/templates

.PHONY: release
release: clean-release $(RELEASE_DIR) ## Builds and push container images using the latest git tag for the commit.
@if [ -z "${RELEASE_TAG}" ]; then echo "RELEASE_TAG is not set"; exit 1; fi
Expand Down Expand Up @@ -485,11 +489,16 @@ release-manifests: $(KUSTOMIZE) $(RELEASE_DIR) ## Builds the manifests to publis
$(KUSTOMIZE) build ./config/default > $(RELEASE_DIR)/operator-components.yaml

.PHONY: release-chart
release-chart: $(HELM) $(KUSTOMIZE) $(RELEASE_DIR) $(CHART_DIR) $(CHART_PACKAGE_DIR) ## Builds the chart to publish with a release
release-chart: $(HELM) $(KUSTOMIZE) $(RELEASE_DIR) $(CHART_DIR) $(CHART_PROVIDERS_DIR) $(CHART_PACKAGE_DIR) ## Builds the chart to publish with a release
# Build and package the cluster-api-operator chart (operator deployment).
cp -rf $(ROOT)/hack/charts/cluster-api-operator/. $(CHART_DIR)
$(KUSTOMIZE) build ./config/chart > $(CHART_DIR)/templates/operator-components.yaml
$(HELM) package $(CHART_DIR) --app-version=$(HELM_CHART_TAG) --version=$(HELM_CHART_TAG) --destination=$(CHART_PACKAGE_DIR)

# Build and package the cluster-api-operator-providers chart (provider CRs only).
cp -rf $(ROOT)/hack/charts/cluster-api-operator-providers/. $(CHART_PROVIDERS_DIR)
$(HELM) package $(CHART_PROVIDERS_DIR) --app-version=$(HELM_CHART_TAG) --version=$(HELM_CHART_TAG) --destination=$(CHART_PACKAGE_DIR)

.PHONY: release-staging
release-staging: ## Builds and push container images and manifests to the staging bucket.
$(MAKE) docker-build-all
Expand Down
23 changes: 23 additions & 0 deletions hack/charts/cluster-api-operator-providers/.helmignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
6 changes: 6 additions & 0 deletions hack/charts/cluster-api-operator-providers/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Chart metadata for the standalone providers chart, which carries only the
# provider custom resources (split out from the operator deployment chart).
apiVersion: v2
name: cluster-api-operator-providers
description: Cluster API Provider Custom Resources
type: application
# Placeholder versions: the real values are injected at package time via
# "helm package --version/--app-version" in the release-chart Make target.
version: 0.0.0
appVersion: "0.0.0"
24 changes: 24 additions & 0 deletions hack/charts/cluster-api-operator-providers/templates/_helpers.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{{/* vim: set filetype=mustache: */}}
{{/*
Expand the name of the chart.
Uses .Values.nameOverride when set, otherwise .Chart.Name; truncated to 63
characters with any trailing "-" removed.
NOTE(review): the helper names keep the "capi-operator" prefix rather than
this chart's own name (cluster-api-operator-providers) — confirm that is
intentional for consistency with the main chart.
*/}}
{{- define "capi-operator.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}

{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If the release name already contains the chart name, the release name is used
as-is to avoid repetition (e.g. release "capi-operator" stays "capi-operator"
instead of becoming "capi-operator-capi-operator").
*/}}
{{- define "capi-operator.fullname" -}}
{{- if .Values.fullnameOverride -}}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- if contains $name .Release.Name -}}
{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- end -}}
{{- end -}}
65 changes: 65 additions & 0 deletions hack/charts/cluster-api-operator-providers/values.schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "type": "object",
  "properties": {
    "core": {
      "oneOf": [
        { "type": "object" },
        { "type": "null" }
      ]
    },
    "bootstrap": {
      "oneOf": [
        { "type": "object" },
        { "type": "null" }
      ]
    },
    "controlPlane": {
      "oneOf": [
        { "type": "object" },
        { "type": "null" }
      ]
    },
    "infrastructure": {
      "oneOf": [
        { "type": "object" },
        { "type": "null" }
      ]
    },
    "addon": {
      "oneOf": [
        { "type": "object" },
        { "type": "null" }
      ]
    },
    "ipam": {
      "oneOf": [
        { "type": "object" },
        { "type": "null" }
      ]
    },
    "manager": {
      "type": "object",
      "properties": {
        "featureGates": {
          "type": "object"
        }
      }
    },
    "fetchConfig": {
      "type": "object"
    },
    "configSecret": {
      "type": "object"
    },
    "enableHelmHook": {
      "type": "boolean",
      "default": false
    }
  }
}
Loading
Loading