Skip to content

Commit e4c26a9

Browse files
fix(infra): Fix helm test timeout (#5386)
1 parent 36c96f2 commit e4c26a9

15 files changed

+191
-29
lines changed

.github/workflows/pr-helm-chart-testing.yml

Lines changed: 147 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -53,27 +53,154 @@ jobs:
5353
if: steps.list-changed.outputs.changed == 'true'
5454
uses: helm/kind-action@v1.12.0
5555

56-
- name: Run chart-testing (install)
56+
- name: Pre-install cluster status check
5757
if: steps.list-changed.outputs.changed == 'true'
58-
run: ct install --all \
59-
--helm-extra-set-args="\
60-
--set=nginx.enabled=false \
61-
--set=postgresql.enabled=false \
62-
--set=redis.enabled=false \
63-
--set=minio.enabled=false \
64-
--set=vespa.enabled=false \
65-
--set=slackbot.enabled=false \
66-
--set=api.replicaCount=0 \
67-
--set=inferenceCapability.replicaCount=0 \
68-
--set=indexCapability.replicaCount=0 \
69-
--set=celery_beat.replicaCount=0 \
70-
--set=celery_worker_heavy.replicaCount=0 \
71-
--set=celery_worker_docprocessing.replicaCount=0 \
72-
--set=celery_worker_light.replicaCount=0 \
73-
--set=celery_worker_monitoring.replicaCount=0 \
74-
--set=celery_worker_primary.replicaCount=0 \
75-
--set=celery_worker_user_files_indexing.replicaCount=0" \
76-
--debug --config ct.yaml
58+
run: |
59+
echo "=== Pre-install Cluster Status ==="
60+
kubectl get nodes -o wide
61+
kubectl get pods --all-namespaces
62+
kubectl get storageclass
63+
64+
- name: Add Helm repositories and update
65+
if: steps.list-changed.outputs.changed == 'true'
66+
run: |
67+
echo "=== Adding Helm repositories ==="
68+
helm repo add bitnami https://charts.bitnami.com/bitnami
69+
helm repo add vespa https://onyx-dot-app.github.io/vespa-helm-charts
70+
helm repo update
71+
72+
- name: Pre-pull critical images
73+
if: steps.list-changed.outputs.changed == 'true'
74+
run: |
75+
echo "=== Pre-pulling critical images to avoid timeout ==="
76+
# Get kind cluster name
77+
KIND_CLUSTER=$(kubectl config current-context | sed 's/kind-//')
78+
echo "Kind cluster: $KIND_CLUSTER"
79+
80+
# Pre-pull images that are likely to be used
81+
echo "Pre-pulling PostgreSQL image..."
82+
docker pull postgres:15-alpine || echo "Failed to pull postgres:15-alpine"
83+
kind load docker-image postgres:15-alpine --name $KIND_CLUSTER || echo "Failed to load postgres image"
84+
85+
echo "Pre-pulling Redis image..."
86+
docker pull redis:7-alpine || echo "Failed to pull redis:7-alpine"
87+
kind load docker-image redis:7-alpine --name $KIND_CLUSTER || echo "Failed to load redis image"
88+
89+
echo "Pre-pulling Onyx images..."
90+
docker pull docker.io/onyxdotapp/onyx-web-server:latest || echo "Failed to pull onyx web server"
91+
docker pull docker.io/onyxdotapp/onyx-backend:latest || echo "Failed to pull onyx backend"
92+
kind load docker-image docker.io/onyxdotapp/onyx-web-server:latest --name $KIND_CLUSTER || echo "Failed to load onyx web server"
93+
kind load docker-image docker.io/onyxdotapp/onyx-backend:latest --name $KIND_CLUSTER || echo "Failed to load onyx backend"
94+
95+
echo "=== Images loaded into Kind cluster ==="
96+
docker exec $KIND_CLUSTER-control-plane crictl images | grep -E "(postgres|redis|onyx)" || echo "Some images may still be loading..."
97+
98+
- name: Validate chart dependencies
99+
if: steps.list-changed.outputs.changed == 'true'
100+
run: |
101+
echo "=== Validating chart dependencies ==="
102+
cd deployment/helm/charts/onyx
103+
helm dependency update
104+
helm lint .
105+
106+
- name: Run chart-testing (install) with enhanced monitoring
107+
timeout-minutes: 25
108+
if: steps.list-changed.outputs.changed == 'true'
109+
run: |
110+
echo "=== Starting chart installation with monitoring ==="
111+
112+
# Function to monitor cluster state
113+
monitor_cluster() {
114+
while true; do
115+
echo "=== Cluster Status Check at $(date) ==="
116+
# Only show non-running pods to reduce noise
117+
NON_RUNNING_PODS=$(kubectl get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded --no-headers 2>/dev/null | wc -l)
118+
if [ "$NON_RUNNING_PODS" -gt 0 ]; then
119+
echo "Non-running pods:"
120+
kubectl get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded
121+
else
122+
echo "All pods running successfully"
123+
fi
124+
# Only show recent events if there are issues
125+
RECENT_EVENTS=$(kubectl get events --sort-by=.lastTimestamp --all-namespaces --field-selector=type!=Normal 2>/dev/null | tail -5)
126+
if [ -n "$RECENT_EVENTS" ]; then
127+
echo "Recent warnings/errors:"
128+
echo "$RECENT_EVENTS"
129+
fi
130+
sleep 60
131+
done
132+
}
133+
134+
# Start monitoring in background
135+
monitor_cluster &
136+
MONITOR_PID=$!
137+
138+
# Set up cleanup
139+
cleanup() {
140+
echo "=== Cleaning up monitoring process ==="
141+
kill $MONITOR_PID 2>/dev/null || true
142+
echo "=== Final cluster state ==="
143+
kubectl get pods --all-namespaces
144+
kubectl get events --all-namespaces --sort-by=.lastTimestamp | tail -20
145+
}
146+
147+
# Trap cleanup on exit
148+
trap cleanup EXIT
149+
150+
# Run the actual installation with detailed logging
151+
echo "=== Starting ct install ==="
152+
ct install --all \
153+
--helm-extra-set-args="\
154+
--set=nginx.enabled=false \
155+
--set=minio.enabled=false \
156+
--set=vespa.enabled=false \
157+
--set=slackbot.enabled=false \
158+
--set=postgresql.enabled=true \
159+
--set=postgresql.primary.persistence.enabled=false \
160+
--set=redis.enabled=true \
161+
--set=webserver.replicaCount=1 \
162+
--set=api.replicaCount=0 \
163+
--set=inferenceCapability.replicaCount=0 \
164+
--set=indexCapability.replicaCount=0 \
165+
--set=celery_beat.replicaCount=0 \
166+
--set=celery_worker_heavy.replicaCount=0 \
167+
--set=celery_worker_docfetching.replicaCount=0 \
168+
--set=celery_worker_docprocessing.replicaCount=0 \
169+
--set=celery_worker_light.replicaCount=0 \
170+
--set=celery_worker_monitoring.replicaCount=0 \
171+
--set=celery_worker_primary.replicaCount=0 \
172+
--set=celery_worker_user_files_indexing.replicaCount=0" \
173+
--helm-extra-args="--timeout 900s --debug" \
174+
--debug --config ct.yaml
175+
176+
echo "=== Installation completed successfully ==="
177+
kubectl get pods --all-namespaces
178+
179+
- name: Post-install verification
180+
if: steps.list-changed.outputs.changed == 'true'
181+
run: |
182+
echo "=== Post-install verification ==="
183+
kubectl get pods --all-namespaces
184+
kubectl get services --all-namespaces
185+
# Only show issues if they exist
186+
kubectl describe pods --all-namespaces | grep -A 5 -B 2 "Failed\|Error\|Warning" || echo "No pod issues found"
187+
188+
- name: Cleanup on failure
189+
if: failure() && steps.list-changed.outputs.changed == 'true'
190+
run: |
191+
echo "=== Cleanup on failure ==="
192+
echo "=== Final cluster state ==="
193+
kubectl get pods --all-namespaces
194+
kubectl get events --all-namespaces --sort-by=.lastTimestamp | tail -10
195+
196+
echo "=== Pod descriptions for debugging ==="
197+
kubectl describe pods --all-namespaces | grep -A 10 -B 3 "Failed\|Error\|Warning\|Pending" || echo "No problematic pods found"
198+
199+
echo "=== Recent logs for debugging ==="
200+
kubectl logs --all-namespaces --tail=50 | grep -i "error\|timeout\|failed\|pull" || echo "No error logs found"
201+
202+
echo "=== Helm releases ==="
203+
helm list --all-namespaces
77204
# the following would install only changed charts, but we only have one chart so
78205
# don't worry about that for now
79206
# run: ct install --target-branch ${{ github.event.repository.default_branch }}

deployment/helm/charts/onyx/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ home: https://www.onyx.app/
55
sources:
66
- "https://github.com/onyx-dot-app/onyx"
77
type: application
8-
version: 0.2.10
8+
version: 0.2.11
99
appVersion: latest
1010
annotations:
1111
category: Productivity

deployment/helm/charts/onyx/templates/api-deployment.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
{{- if gt (int .Values.api.replicaCount) 0 }}
12
apiVersion: apps/v1
23
kind: Deployment
34
metadata:
@@ -69,3 +70,4 @@ spec:
6970
volumes:
7071
{{- toYaml . | nindent 8 }}
7172
{{- end }}
73+
{{- end }}

deployment/helm/charts/onyx/templates/celery-beat.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1+
{{- if gt (int .Values.celery_beat.replicaCount) 0 }}
12
apiVersion: apps/v1
23
kind: Deployment
34
metadata:
45
name: {{ include "onyx-stack.fullname" . }}-celery-beat
56
labels:
67
{{- include "onyx-stack.labels" . | nindent 4 }}
78
spec:
8-
replicas: 1
9+
replicas: {{ .Values.celery_beat.replicaCount }}
910
selector:
1011
matchLabels:
1112
{{- include "onyx-stack.selectorLabels" . | nindent 6 }}
@@ -86,3 +87,4 @@ spec:
8687
volumes:
8788
{{- toYaml . | nindent 8 }}
8889
{{- end }}
90+
{{- end }}

deployment/helm/charts/onyx/templates/celery-worker-docfetching.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
{{- if gt (int .Values.celery_worker_docfetching.replicaCount) 0 }}
12
apiVersion: apps/v1
23
kind: Deployment
34
metadata:
@@ -93,4 +94,5 @@ spec:
9394
{{- with .Values.celery_worker_docfetching.volumes }}
9495
volumes:
9596
{{- toYaml . | nindent 8 }}
96-
{{- end }}
97+
{{- end }}
98+
{{- end }}

deployment/helm/charts/onyx/templates/celery-worker-docprocessing.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
{{- if gt (int .Values.celery_worker_docprocessing.replicaCount) 0 }}
12
apiVersion: apps/v1
23
kind: Deployment
34
metadata:
@@ -96,3 +97,4 @@ spec:
9697
volumes:
9798
{{- toYaml . | nindent 8 }}
9899
{{- end }}
100+
{{- end }}

deployment/helm/charts/onyx/templates/celery-worker-heavy.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
{{- if gt (int .Values.celery_worker_heavy.replicaCount) 0 }}
12
apiVersion: apps/v1
23
kind: Deployment
34
metadata:
@@ -91,3 +92,4 @@ spec:
9192
volumes:
9293
{{- toYaml . | nindent 8 }}
9394
{{- end }}
95+
{{- end }}

deployment/helm/charts/onyx/templates/celery-worker-light.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
{{- if gt (int .Values.celery_worker_light.replicaCount) 0 }}
12
apiVersion: apps/v1
23
kind: Deployment
34
metadata:
@@ -91,3 +92,4 @@ spec:
9192
volumes:
9293
{{- toYaml . | nindent 8 }}
9394
{{- end }}
95+
{{- end }}

deployment/helm/charts/onyx/templates/celery-worker-monitoring.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
{{- if gt (int .Values.celery_worker_monitoring.replicaCount) 0 }}
12
apiVersion: apps/v1
23
kind: Deployment
34
metadata:
@@ -91,3 +92,4 @@ spec:
9192
volumes:
9293
{{- toYaml . | nindent 8 }}
9394
{{- end }}
95+
{{- end }}

deployment/helm/charts/onyx/templates/celery-worker-primary.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
{{- if gt (int .Values.celery_worker_primary.replicaCount) 0 }}
12
apiVersion: apps/v1
23
kind: Deployment
34
metadata:
@@ -91,3 +92,4 @@ spec:
9192
volumes:
9293
{{- toYaml . | nindent 8 }}
9394
{{- end }}
95+
{{- end }}

0 commit comments

Comments
 (0)