Skip to content

Commit 54e83d7

Browse files
Merge pull request #604 from ClickHouse/revert-599-egork/incident-2025-08-31-services-in-aws-fail-after-provisioning
Revert "debug: diagnostics for incident 2025-08-31"
2 parents 5491805 + d43da3f commit 54e83d7

File tree

3 files changed: 12 additions and 77 deletions.
Lines changed: 5 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,6 @@
11
name: "Run ClickBench on ClickHouse Cloud"
22
on:
33
workflow_dispatch: # This allows manual trigger from the UI
4-
inputs:
5-
max_parallel:
6-
description: "Max concurrent services (0 = no cap)"
7-
required: false
8-
default: "0"
94
schedule:
105
- cron: '10 10 * * *'
116

@@ -18,7 +13,6 @@ jobs:
1813
env:
1914
CI_COMMIT_MESSAGE: "[bot] update results for ClickHouse Cloud"
2015
CI_COMMIT_AUTHOR: github
21-
MAX_PARALLEL: ${{ github.event.inputs.max_parallel || '0' }}
2216
steps:
2317
- uses: actions/checkout@v3
2418
- run: |
@@ -30,52 +24,21 @@ jobs:
3024
echo "Required secrets are not set. Skipping workflow."
3125
exit 0
3226
fi
27+
3328
cd clickhouse-cloud
3429
curl https://clickhouse.com/ | sh
3530
sudo ./clickhouse install -y
36-
- name: Install diagnostics tools
37-
run: |
38-
sudo apt-get update -y
39-
sudo DEBIAN_FRONTEND=noninteractive apt-get install -y sysstat iproute2 traceroute moreutils procps dnsutils
40-
- name: Start diagnostics collectors
41-
working-directory: clickhouse-cloud
42-
run: |
43-
mkdir -p diag
44-
# CPU usage per core every second
45-
(mpstat -P ALL 1 > diag/mpstat.log 2>&1 &)
46-
# System runq/mem/io snapshot every second
47-
(vmstat 1 > diag/vmstat.log 2>&1 &)
48-
# Network device and TCP stats every second (ignore if sar not available)
49-
(sar -n DEV 1 > diag/sar_net_dev.log 2>&1 &) || true
50-
(sar -n TCP,ETCP 1 > diag/sar_tcp.log 2>&1 &) || true
51-
# Socket summary snapshot once per second
52-
(for i in $(seq 1 3600); do date +%FT%T%z >> diag/ss.log; ss -s >> diag/ss.log; sleep 1; done) &
5331
5432
bash combinations.sh
5533
bash collect-results.sh
5634
57-
- name: Upload diagnostics and raw run outputs
58-
uses: actions/upload-artifact@v4
59-
with:
60-
name: clickhouse-cloud-diag-${{ github.run_id }}
61-
path: |
62-
clickhouse-cloud/diag/**
63-
clickhouse-cloud/csp-*/netdebug.txt
64-
clickhouse-cloud/csp-*/chclient_errors.log
65-
clickhouse-cloud/csp-*/result
66-
clickhouse-cloud/csp-*/state
67-
clickhouse-cloud/results/*.json
68-
if-no-files-found: ignore
69-
- name: Commit results
70-
working-directory: clickhouse-cloud
71-
run: |
7235
git config --global user.name "${{ env.CI_COMMIT_AUTHOR }}"
7336
git config --global user.email "${{ env.CI_COMMIT_AUTHOR }}@users.noreply.github.com"
37+
7438
git pull
75-
git add results/*.json || true
76-
if git status --porcelain | grep -qE "^ M|^A |^\?\? "; then
39+
git add results/*.json
40+
if git status | grep -q modified
41+
then
7742
git commit -m "${{ env.CI_COMMIT_MESSAGE }}"
7843
git push
79-
else
80-
echo "No changes to commit."
8144
fi

clickhouse-cloud/cloud-api.sh

Lines changed: 1 addition & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -86,27 +86,9 @@ done
8686

8787
echo "Waiting for clickhouse-server to start"
8888

89-
# Collect quick network/TLS diagnostics for this run
90-
{
91-
echo "==== NET DEBUG $(date -Iseconds) ===="
92-
echo "Host: $(hostname)"
93-
echo "Kernel: $(uname -a)"
94-
echo "ulimit -n: $(ulimit -n 2>/dev/null || echo n/a)"
95-
echo "FQDN: ${FQDN}"
96-
echo "IP route:"; ip route || true
97-
echo "IP addr:"; ip -brief address || ip addr || true
98-
echo "DNS resolution for ${FQDN}:"; getent hosts "${FQDN}" || nslookup "${FQDN}" 2>&1 || host "${FQDN}" 2>&1 || true
99-
echo "Socket summary:"; ss -s || true
100-
echo "Public IP (best-effort):"; curl -fsS --max-time 3 https://ifconfig.me || echo "n/a"
101-
echo "OpenSSL s_client (short):"; timeout 10s openssl s_client -connect "${FQDN}:9440" -servername "${FQDN}" -brief </dev/null 2>&1 || true
102-
echo "Traceroute TCP:9440:"; timeout 20s traceroute -T -p 9440 -n "${FQDN}" 2>&1 || traceroute -n "${FQDN}" 2>&1 || true
103-
} > "${TMPDIR}/netdebug.txt" 2>&1
104-
10589
for i in {1..1000}
10690
do
107-
if clickhouse-client --host "$FQDN" --password "$PASSWORD" --secure --query "SELECT 1" 1>/dev/null 2>>"${TMPDIR}/chclient_errors.log"; then
108-
break
109-
fi
91+
clickhouse-client --host "$FQDN" --password "$PASSWORD" --secure --query "SELECT 1" && break
11092
sleep 1
11193
if [[ $i == 1000 ]]
11294
then

clickhouse-cloud/combinations.sh

Lines changed: 6 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -8,22 +8,12 @@ PROVIDER=aws
88
REGION='us-east-1'
99
PARALLEL_REPLICA=false
1010

11-
# Optional cap on parallel jobs; set MAX_PARALLEL=N in the environment to enable.
12-
throttle() {
13-
local max=${MAX_PARALLEL:-0}
14-
if [[ -n "$max" && "$max" =~ ^[0-9]+$ && $max -gt 0 ]]; then
15-
while [[ $(jobs -rp | wc -l) -ge $max ]]; do
16-
sleep 1
17-
done
18-
fi
19-
}
20-
2111
for REPLICAS in 1
2212
do
2313
for MEMORY in 8 12
2414
do
2515
export PROVIDER REPLICAS REGION MEMORY PARALLEL_REPLICA
26-
throttle; ./cloud-api.sh &
16+
./cloud-api.sh &
2717
sleep 10 # Prevent "Too many requests" to the API
2818
done
2919
done
@@ -33,7 +23,7 @@ do
3323
for MEMORY in 8 12 16 32 64 120 236
3424
do
3525
export PROVIDER REPLICAS REGION MEMORY PARALLEL_REPLICA
36-
throttle; ./cloud-api.sh &
26+
./cloud-api.sh &
3727
sleep 10
3828
done
3929
done
@@ -46,7 +36,7 @@ do
4636
for MEMORY in 8 12
4737
do
4838
export PROVIDER REPLICAS REGION MEMORY PARALLEL_REPLICA
49-
throttle; ./cloud-api.sh &
39+
./cloud-api.sh &
5040
sleep 10
5141
done
5242
done
@@ -56,7 +46,7 @@ do
5646
for MEMORY in 8 12 16 32 64 120 236
5747
do
5848
export PROVIDER REPLICAS REGION MEMORY PARALLEL_REPLICA
59-
throttle; ./cloud-api.sh &
49+
./cloud-api.sh &
6050
sleep 10
6151
done
6252
done
@@ -69,7 +59,7 @@ do
6959
for MEMORY in 8 12
7060
do
7161
export PROVIDER REPLICAS REGION MEMORY PARALLEL_REPLICA
72-
throttle; ./cloud-api.sh &
62+
./cloud-api.sh &
7363
sleep 10
7464
done
7565
done
@@ -79,7 +69,7 @@ do
7969
for MEMORY in 8 12 16 32 64 120
8070
do
8171
export PROVIDER REPLICAS REGION MEMORY PARALLEL_REPLICA
82-
throttle; ./cloud-api.sh &
72+
./cloud-api.sh &
8373
sleep 10
8474
done
8575
done

0 commit comments

Comments
 (0)