@@ -53,27 +53,154 @@ jobs:
53
53
if : steps.list-changed.outputs.changed == 'true'
54
54
uses : helm/kind-action@v1.12.0
55
55
56
- - name : Run chart-testing (install)
56
+ - name : Pre-install cluster status check
57
57
if : steps.list-changed.outputs.changed == 'true'
58
- run : ct install --all \
59
- --helm-extra-set-args="\
60
- --set=nginx.enabled=false \
61
- --set=postgresql.enabled=false \
62
- --set=redis.enabled=false \
63
- --set=minio.enabled=false \
64
- --set=vespa.enabled=false \
65
- --set=slackbot.enabled=false \
66
- --set=api.replicaCount=0 \
67
- --set=inferenceCapability.replicaCount=0 \
68
- --set=indexCapability.replicaCount=0 \
69
- --set=celery_beat.replicaCount=0 \
70
- --set=celery_worker_heavy.replicaCount=0 \
71
- --set=celery_worker_docprocessing.replicaCount=0 \
72
- --set=celery_worker_light.replicaCount=0 \
73
- --set=celery_worker_monitoring.replicaCount=0 \
74
- --set=celery_worker_primary.replicaCount=0 \
75
- --set=celery_worker_user_files_indexing.replicaCount=0" \
76
- --debug --config ct.yaml
58
+ run : |
59
+ echo "=== Pre-install Cluster Status ===" # banner marking the start of this section in the job log
60
+ kubectl get nodes -o wide # node list with the extended (wide) columns
61
+ kubectl get pods --all-namespaces # pods in every namespace, captured before the chart install step runs
62
+ kubectl get storageclass # storage classes available in the cluster
63
+
64
+ - name : Add Helm repositories and update # registers the bitnami and vespa chart repos, then refreshes the local repo index
65
+ if : steps.list-changed.outputs.changed == 'true' # runs only when the list-changed step's `changed` output is 'true'
66
+ run : |
67
+ echo "=== Adding Helm repositories ==="
68
+ helm repo add bitnami https://charts.bitnami.com/bitnami
69
+ helm repo add vespa https://onyx-dot-app.github.io/vespa-helm-charts
70
+ helm repo update # NOTE(review): presumably these repos back the onyx chart's dependencies; confirm against the chart's Chart.yaml
71
+
72
+ - name : Pre-pull critical images # pulls images on the runner and side-loads them into the kind node
73
+ if : steps.list-changed.outputs.changed == 'true'
74
+ run : |
75
+ echo "=== Pre-pulling critical images to avoid timeout ==="
76
+ # kind names its kubectl context "kind-<cluster>"; strip only that leading prefix (anchored, so a "kind-" later in the name survives)
77
+ KIND_CLUSTER=$(kubectl config current-context | sed 's/^kind-//')
78
+ echo "Kind cluster: $KIND_CLUSTER"
79
+
80
+ # Best effort: a failed pull/load is only logged and never fails the step. NOTE(review): confirm these tags match the images the chart actually deploys
81
+ echo "Pre-pulling PostgreSQL image..."
82
+ docker pull postgres:15-alpine || echo "Failed to pull postgres:15-alpine"
83
+ kind load docker-image postgres:15-alpine --name "$KIND_CLUSTER" || echo "Failed to load postgres image"
84
+
85
+ echo "Pre-pulling Redis image..."
86
+ docker pull redis:7-alpine || echo "Failed to pull redis:7-alpine"
87
+ kind load docker-image redis:7-alpine --name "$KIND_CLUSTER" || echo "Failed to load redis image"
88
+
89
+ echo "Pre-pulling Onyx images..."
90
+ docker pull docker.io/onyxdotapp/onyx-web-server:latest || echo "Failed to pull onyx web server"
91
+ docker pull docker.io/onyxdotapp/onyx-backend:latest || echo "Failed to pull onyx backend"
92
+ kind load docker-image docker.io/onyxdotapp/onyx-web-server:latest --name "$KIND_CLUSTER" || echo "Failed to load onyx web server"
93
+ kind load docker-image docker.io/onyxdotapp/onyx-backend:latest --name "$KIND_CLUSTER" || echo "Failed to load onyx backend"
94
+
95
+ echo "=== Images loaded into Kind cluster ==="
96
+ docker exec "${KIND_CLUSTER}-control-plane" crictl images | grep -E "(postgres|redis|onyx)" || echo "Some images may still be loading..."
97
+
98
+ - name : Validate chart dependencies # refreshes the onyx chart's dependencies and lints the chart
99
+ if : steps.list-changed.outputs.changed == 'true'
100
+ run : |
101
+ echo "=== Validating chart dependencies ==="
102
+ cd deployment/helm/charts/onyx # both helm commands below are run from inside the chart directory
103
+ helm dependency update
104
+ helm lint .
105
+
106
+ - name : Run chart-testing (install) with enhanced monitoring
107
+ timeout-minutes : 25
108
+ if : steps.list-changed.outputs.changed == 'true'
109
+ run : |
110
+ echo "=== Starting chart installation with monitoring ==="
111
+
112
+ # Background loop: every 60s report pods that are not Running/Succeeded and the last 5 non-Normal events
113
+ monitor_cluster() {
114
+ while true; do
115
+ echo "=== Cluster Status Check at $(date) ==="
116
+ # Count only non-running pods to reduce noise; kubectl's stderr is discarded, so a failed call also counts as zero
117
+ NON_RUNNING_PODS=$(kubectl get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded --no-headers 2>/dev/null | wc -l)
118
+ if [ "$NON_RUNNING_PODS" -gt 0 ]; then
119
+ echo "Non-running pods:"
120
+ kubectl get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded
121
+ else
122
+ echo "All pods running successfully"
123
+ fi
124
+ # Show events only when there are non-Normal ones (the output may include the header row)
125
+ RECENT_EVENTS=$(kubectl get events --sort-by=.lastTimestamp --all-namespaces --field-selector=type!=Normal 2>/dev/null | tail -5)
126
+ if [ -n "$RECENT_EVENTS" ]; then
127
+ echo "Recent warnings/errors:"
128
+ echo "$RECENT_EVENTS"
129
+ fi
130
+ sleep 60
131
+ done
132
+ }
133
+
134
+ # Start monitoring in background with errexit switched off in that subshell, so one failed kubectl call cannot silently end the loop
135
+ ( set +e; monitor_cluster ) &
136
+ MONITOR_PID=$!
137
+
138
+ # Cleanup: stop the monitor and dump final state; each diagnostic is best-effort so it cannot change the step's exit status
139
+ cleanup() {
140
+ echo "=== Cleaning up monitoring process ==="
141
+ kill "$MONITOR_PID" 2>/dev/null || true
142
+ echo "=== Final cluster state ==="
143
+ kubectl get pods --all-namespaces || true
144
+ kubectl get events --all-namespaces --sort-by=.lastTimestamp | tail -20 || true
145
+ }
146
+
147
+ # Run cleanup whenever the script exits, whether ct install succeeded or failed
148
+ trap cleanup EXIT
149
+
150
+ # Run the actual installation: postgresql, redis and one webserver replica are enabled, all other workloads are disabled or scaled to 0
151
+ echo "=== Starting ct install ==="
152
+ ct install --all \
153
+ --helm-extra-set-args="\
154
+ --set=nginx.enabled=false \
155
+ --set=minio.enabled=false \
156
+ --set=vespa.enabled=false \
157
+ --set=slackbot.enabled=false \
158
+ --set=postgresql.enabled=true \
159
+ --set=postgresql.primary.persistence.enabled=false \
160
+ --set=redis.enabled=true \
161
+ --set=webserver.replicaCount=1 \
162
+ --set=api.replicaCount=0 \
163
+ --set=inferenceCapability.replicaCount=0 \
164
+ --set=indexCapability.replicaCount=0 \
165
+ --set=celery_beat.replicaCount=0 \
166
+ --set=celery_worker_heavy.replicaCount=0 \
167
+ --set=celery_worker_docfetching.replicaCount=0 \
168
+ --set=celery_worker_docprocessing.replicaCount=0 \
169
+ --set=celery_worker_light.replicaCount=0 \
170
+ --set=celery_worker_monitoring.replicaCount=0 \
171
+ --set=celery_worker_primary.replicaCount=0 \
172
+ --set=celery_worker_user_files_indexing.replicaCount=0" \
173
+ --helm-extra-args="--timeout 900s --debug" \
174
+ --debug --config ct.yaml
175
+
176
+ echo "=== Installation completed successfully ==="
177
+ kubectl get pods --all-namespaces
178
+
179
+ - name : Post-install verification # lists pods and services, then scans pod descriptions for trouble keywords
180
+ if : steps.list-changed.outputs.changed == 'true'
181
+ run : |
182
+ echo "=== Post-install verification ==="
183
+ kubectl get pods --all-namespaces
184
+ kubectl get services --all-namespaces
185
+ # Print only description lines matching Failed/Error/Warning (2 lines before, 5 after); the echo fallback runs when the pipeline returns non-zero, e.g. grep matched nothing
186
+ kubectl describe pods --all-namespaces | grep -A 5 -B 2 "Failed\|Error\|Warning" || echo "No pod issues found"
187
+
188
+ - name : Cleanup on failure # diagnostics only: dumps cluster state, pod descriptions, pod logs and helm releases after a failed step
189
+ if : failure() && steps.list-changed.outputs.changed == 'true'
190
+ run : |
191
+ echo "=== Cleanup on failure ==="
192
+ echo "=== Final cluster state ==="
193
+ kubectl get pods --all-namespaces || true
194
+ kubectl get events --all-namespaces --sort-by=.lastTimestamp | tail -10 || true
195
+
196
+ echo "=== Pod descriptions for debugging ==="
197
+ kubectl describe pods --all-namespaces | grep -A 10 -B 3 "Failed\|Error\|Warning\|Pending" || echo "No problematic pods found"
198
+ # kubectl logs has no --all-namespaces flag and needs a pod, so walk every pod explicitly; per-pod errors are kept in the stream and never abort the loop
199
+ echo "=== Recent logs for debugging ==="
200
+ kubectl get pods --all-namespaces --no-headers -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name | while read -r ns pod; do kubectl logs --namespace "$ns" "$pod" --all-containers --prefix --tail=50 2>&1 || true; done | grep -i "error\|timeout\|failed\|pull" || echo "No error logs found"
201
+
202
+ echo "=== Helm releases ==="
203
+ helm list --all-namespaces
77
204
# the following would install only changed charts, but we only have one chart so
78
205
# don't worry about that for now
79
206
# run: ct install --target-branch ${{ github.event.repository.default_branch }}
0 commit comments