Skip to content

Commit 2e2ff99

Browse files
committed
Merge stackhpc/yoga into stackhpc/zed
2 parents 0f6c73e + 410e54b commit 2e2ff99

File tree

6 files changed

+248
-6
lines changed

6 files changed

+248
-6
lines changed

.github/workflows/stackhpc-all-in-one.yml

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,17 @@ jobs:
137137
run: ssh-keygen -f id_rsa -N ''
138138
working-directory: ${{ github.workspace }}/terraform/aio
139139

140+
# TODO: Remove the following step in Antelope.
141+
# NOTE: In Ansible 2.10 and lower the synchronize module used in the
142+
# ansible/diagnostics.yml playbook does not respect SSH connection
143+
# variables. This may result in Permission Denied issues if using an SSH
144+
# key that is not in ~/.ssh.
145+
- name: Copy SSH keypair to .ssh/
146+
run: |
147+
install -d ~/.ssh -m 700 &&
148+
cp id_rsa* ~/.ssh/
149+
working-directory: ${{ github.workspace }}/terraform/aio
150+
140151
- name: Generate clouds.yaml
141152
run: |
142153
cat << EOF > clouds.yaml
@@ -179,6 +190,7 @@ jobs:
179190
OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }}
180191

181192
- name: Terraform Apply
193+
id: tf_apply
182194
run: |
183195
for attempt in $(seq 5); do
184196
if terraform apply -auto-approve; then
@@ -355,6 +367,7 @@ jobs:
355367
if: inputs.upgrade
356368

357369
- name: Tempest tests
370+
id: tempest
358371
run: |
359372
mkdir -p tempest-artifacts
360373
docker run -t --rm \
@@ -366,11 +379,28 @@ jobs:
366379
env:
367380
KAYOBE_AUTOMATION_SSH_PRIVATE_KEY: ${{ steps.ssh_key.outputs.ssh_key }}
368381

382+
- name: Collect diagnostic information
383+
id: diagnostics
384+
run: |
385+
mkdir -p diagnostics
386+
sudo -E docker run -t --rm \
387+
-v $(pwd):/stack/kayobe-automation-env/src/kayobe-config \
388+
-v $(pwd)/diagnostics:/stack/diagnostics \
389+
-e KAYOBE_ENVIRONMENT -e KAYOBE_VAULT_PASSWORD -e KAYOBE_AUTOMATION_SSH_PRIVATE_KEY \
390+
$KAYOBE_IMAGE \
391+
/stack/kayobe-automation-env/src/kayobe-config/.automation/pipeline/playbook-run.sh '$KAYOBE_CONFIG_PATH/ansible/diagnostics.yml'
392+
env:
393+
KAYOBE_AUTOMATION_SSH_PRIVATE_KEY: ${{ steps.ssh_key.outputs.ssh_key }}
394+
if: ${{ always() && steps.tf_apply.outcome == 'success' }}
395+
369396
- name: Upload test result artifacts
370397
uses: actions/upload-artifact@v4
371398
with:
372-
name: tempest-results-${{ inputs.os_distribution }}-${{ inputs.os_release }}-${{ inputs.neutron_plugin }}${{ inputs.upgrade && '-upgrade' }}
373-
path: tempest-artifacts/*
399+
name: test-results-${{ inputs.os_distribution }}-${{ inputs.os_release }}-${{ inputs.neutron_plugin }}${{ inputs.upgrade && '-upgrade' }}
400+
path: |
401+
diagnostics/
402+
tempest-artifacts/
403+
if: ${{ always() && (steps.tempest.outcome == 'success' || steps.diagnostics.outcome == 'success') }}
374404

375405
- name: Fail if any Tempest tests failed
376406
run: |

doc/source/configuration/monitoring.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ on the overcloud hosts:
8585
.. code-block:: console
8686
8787
(kayobe) [stack@node ~]$ cd etc/kayobe
88-
(kayobe) [stack@node kayobe]$ kayobe playbook run ansible/smartmontools.yml
88+
(kayobe) [stack@node kayobe]$ kayobe playbook run ansible/smartmon-tools.yml
8989
9090
SMART reporting should now be enabled along with a Prometheus alert for
9191
unhealthy disks and a Grafana dashboard called ``Hardware Overview``.

etc/kayobe/ansible/diagnostics.yml

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
---
# This playbook runs a script that collects diagnostic information from hosts.
# The diagnostics are aggregated to a directory
# (diagnostics_path_local/inventory_hostname) on localhost.
#
# NOTE: The diagnostic information contains sensitive information such as
# passwords in configuration files.

- name: Collect diagnostic information
  hosts: seed-hypervisor:seed:overcloud:infra-vms
  vars:
    # Directory on localhost under which one sub-directory per host is
    # written.
    diagnostics_path_local: "{{ lookup('env', 'PWD') }}/diagnostics"
  tasks:
    - name: Collect diagnostics into a temporary directory
      block:
        - name: Create a temporary directory for diagnostics
          ansible.builtin.tempfile:
            state: directory
            suffix: diagnostics
          register: diagnostics_tmpdir

        - name: Write host variables to a file
          ansible.builtin.copy:
            content: "{{ hostvars[inventory_hostname].ansible_facts | to_nice_json }}"
            dest: "{{ diagnostics_tmpdir.path }}/facts.json"

        - name: Run diagnostics script
          ansible.builtin.script: "{{ kayobe_config_path }}/../../tools/diagnostics.sh"
          become: true
          # Only treat the task as failed if the script could not be run at
          # all. A non-zero exit code is reported by the "Fail if diagnostics
          # collection failed" task below, after whatever was collected has
          # been downloaded and displayed.
          failed_when: diagnostics_result.rc is not defined
          register: diagnostics_result
          environment:
            LOG_DIR: "{{ diagnostics_tmpdir.path }}"
            CONFIG_DIR: "{{ kayobe_config_path }}/../.."

        - name: Download diagnostic logs to localhost
          ansible.posix.synchronize:
            src: "{{ diagnostics_tmpdir.path }}/"
            dest: "{{ diagnostics_path_local }}/{{ inventory_hostname }}"
            mode: pull
            archive: false
            recursive: true
            copy_links: true
            verify_host: true
            # For jump host
            use_ssh_args: true
          vars:
            # FIXME: The synchronize module fails on Yoga, due to not templating
            # the SSH user.
            ansible_user: stack
      always:
        - name: Clean up temporary directory
          ansible.builtin.file:
            path: "{{ diagnostics_tmpdir.path }}"
            state: absent
          # If creating the temporary directory failed there is nothing to
          # remove, and templating the path would mask the original error.
          when: diagnostics_tmpdir.path is defined

    - name: Display diagnostics collection stdout
      ansible.builtin.debug:
        msg: "{{ diagnostics_result.stdout }}"
      when: diagnostics_result.stdout is defined

    - name: Display diagnostics collection stderr
      ansible.builtin.debug:
        msg: "{{ diagnostics_result.stderr }}"
      when: diagnostics_result.stderr is defined

    - name: Fail if diagnostics collection failed
      ansible.builtin.fail:
        msg: Diagnostics collection failed
      when: diagnostics_result.rc != 0

    - name: Display location of diagnostics archive
      ansible.builtin.debug:
        msg: >-
          Wrote diagnostics to {{ diagnostics_path_local }} on localhost

etc/kayobe/environments/aufn-ceph/tenks.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ node_types:
2121
volumes:
2222
# There is a minimum disk space capacity requirement of 4GiB when using Ironic Python Agent:
2323
# https://github.com/openstack/ironic-python-agent/blob/master/ironic_python_agent/utils.py#L290
24-
- capacity: 10GiB
24+
- capacity: 15GiB
2525
# Ceph volume
26-
- capacity: 10GiB
26+
- capacity: 20GiB
2727
physical_networks:
2828
- provision-net
2929
- cloud-net
@@ -34,7 +34,7 @@ node_types:
3434
volumes:
3535
# There is a minimum disk space capacity requirement of 4GiB when using Ironic Python Agent:
3636
# https://github.com/openstack/ironic-python-agent/blob/master/ironic_python_agent/utils.py#L290
37-
- capacity: 10GiB
37+
- capacity: 15GiB
3838
physical_networks:
3939
- provision-net
4040
- cloud-net
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
---
2+
features:
3+
- |
4+
Adds a new ``diagnostics.yml`` playbook that collects diagnostic
5+
information from hosts. The diagnostics are aggregated to a directory
6+
(``$PWD/diagnostics/`` by default) on localhost. The diagnostics include:
7+
8+
* Docker container logs
9+
* Kolla configuration files
10+
* Log files
11+
12+
*The collected diagnostic information contains sensitive information such
13+
as passwords in configuration files.*
14+

tools/diagnostics.sh

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
#!/bin/bash

# NOTE(mgoddard): This has been adapted from
# roles/kayobe-diagnostics/files/get_logs.sh in Kayobe.

# Collects configuration files, logs and system state from the local host
# into $LOG_DIR. Collection is best-effort: errexit is disabled so that a
# missing file or tool does not prevent the remaining diagnostics from being
# gathered.
#
# Environment variables:
# $LOG_DIR is the directory to copy logs to. It must be set, because the
# script runs with set -u.

# TODO: Make this script more robust and use set -e.
set +o errexit
set -u

# Gather all diagnostics into ${LOG_DIR} and hand ownership to the stack user.
copy_logs() {
    local container service d build_log

    mkdir -p "${LOG_DIR}"/{docker_logs,kolla_node_configs,system_logs}

    cp -rnL /etc/kolla/* "${LOG_DIR}/kolla_node_configs"
    # Don't save the IPA images. They are only present on some hosts, so do
    # not complain when they are missing.
    rm -f "${LOG_DIR}"/kolla_node_configs/ironic-http/ironic-agent.{kernel,initramfs}
    rm -f "${LOG_DIR}"/kolla_node_configs/ironic-tftp/ironic-agent.{kernel,initramfs}

    if [[ -d /opt/kayobe/etc/kolla ]]; then
        mkdir -p "${LOG_DIR}/kolla_build_configs"
        cp -rnL /opt/kayobe/etc/kolla/* "${LOG_DIR}/kolla_build_configs/"
    fi

    cp -rvnL /var/log/* "${LOG_DIR}/system_logs/"

    journalctl --no-pager > "${LOG_DIR}/system_logs/syslog.log"
    journalctl --no-pager -u docker.service > "${LOG_DIR}/system_logs/docker.log"
    journalctl --no-pager -u vbmcd.service > "${LOG_DIR}/system_logs/vbmcd.log"
    journalctl --no-pager -u NetworkManager.service > "${LOG_DIR}/system_logs/NetworkManager.log"

    if [[ -d /etc/sysconfig/network-scripts/ ]]; then
        cp -r /etc/sysconfig/network-scripts/ "${LOG_DIR}/system_logs/"
    fi

    if [[ -d /etc/NetworkManager/system-connections/ ]]; then
        cp -r /etc/NetworkManager/system-connections/ "${LOG_DIR}/system_logs/"
    fi

    if [[ -d /etc/yum.repos.d/ ]]; then
        cp -r /etc/yum.repos.d/ "${LOG_DIR}/system_logs/"
    fi

    if [[ -d /etc/apt/sources.list.d/ ]]; then
        cp -r /etc/apt/sources.list.d/ "${LOG_DIR}/system_logs/"
    fi

    if [[ -d /etc/systemd/ ]]; then
        cp -rL /etc/systemd/ "${LOG_DIR}/system_logs/"
    fi

    df -h > "${LOG_DIR}/system_logs/df.txt"
    # Gather disk usage statistics for files and directories larger than 1MB
    du -d 5 -hx / | sort -hr | grep '^[0-9\.]*[MGT]' > "${LOG_DIR}/system_logs/du.txt"
    free > "${LOG_DIR}/system_logs/free.txt"
    cat /etc/hosts > "${LOG_DIR}/system_logs/hosts.txt"
    parted -l > "${LOG_DIR}/system_logs/parted-l.txt"
    mount > "${LOG_DIR}/system_logs/mount.txt"
    env > "${LOG_DIR}/system_logs/env.txt"
    ip address > "${LOG_DIR}/system_logs/ip-address.txt"
    ip route > "${LOG_DIR}/system_logs/ip-route.txt"
    ip route show table all > "${LOG_DIR}/system_logs/ip-route-all-tables.txt"
    ip rule list > "${LOG_DIR}/system_logs/ip-rule-list.txt"
    pvs > "${LOG_DIR}/system_logs/pvs.txt"
    vgs > "${LOG_DIR}/system_logs/vgs.txt"
    lvs > "${LOG_DIR}/system_logs/lvs.txt"

    iptables-save > "${LOG_DIR}/system_logs/iptables.txt"

    # Record installed packages using whichever package manager is present.
    if command -v dpkg > /dev/null 2>&1; then
        dpkg -l > "${LOG_DIR}/system_logs/dpkg-l.txt"
    fi
    if command -v rpm > /dev/null 2>&1; then
        rpm -qa > "${LOG_DIR}/system_logs/rpm-qa.txt"
    fi

    # final memory usage and process list
    ps -eo user,pid,ppid,lwp,%cpu,%mem,size,rss,cmd > "${LOG_DIR}/system_logs/ps.txt"

    # available entropy
    cat /proc/sys/kernel/random/entropy_avail > "${LOG_DIR}/system_logs/entropy_avail.txt"

    # docker related information
    (docker info && docker images && docker ps -a) > "${LOG_DIR}/system_logs/docker-info.txt"

    for container in $(docker ps -a --format "{{.Names}}"); do
        docker logs --tail all "${container}" &> "${LOG_DIR}/docker_logs/${container}.txt"
    done

    # Bifrost: grab config files and logs from the container.
    if [[ $(docker ps -q -f name=bifrost_deploy) ]]; then
        mkdir -p "${LOG_DIR}/bifrost"
        for service in dnsmasq ironic-api ironic-conductor ironic-inspector mariadb nginx rabbitmq-server; do
            mkdir -p "${LOG_DIR}/bifrost/${service}"
            docker exec bifrost_deploy \
                systemctl status "${service}" -l -n 10000 > "${LOG_DIR}/bifrost/${service}/${service}-systemd-status.txt"
            docker exec bifrost_deploy \
                journalctl -u "${service}" --no-pager > "${LOG_DIR}/bifrost/${service}/${service}-journal.txt"
        done
        # No -i/-t here: the output is redirected to a file, so a TTY is not
        # needed, and -t makes docker exec fail when stdin is not a terminal.
        docker exec bifrost_deploy \
            journalctl --no-pager > "${LOG_DIR}/bifrost/bifrost-journal.log"
        # docker cp requires the destination directory to exist; it is only
        # there already if /etc/kolla contained a bifrost directory.
        mkdir -p "${LOG_DIR}/kolla_node_configs/bifrost"
        for d in dnsmasq.conf ironic ironic-inspector nginx/nginx.conf; do
            docker cp "bifrost_deploy:/etc/${d}" "${LOG_DIR}/kolla_node_configs/bifrost/"
        done
        docker cp bifrost_deploy:/var/log/mariadb/mariadb.log "${LOG_DIR}/bifrost/mariadb/"
    fi

    # IPA build logs. Either file may be absent, so copy them individually.
    if [[ -f /opt/kayobe/images/ipa/ipa.stderr ]] || [[ -f /opt/kayobe/images/ipa/ipa.stdout ]]; then
        mkdir -p "${LOG_DIR}/ipa"
        for build_log in /opt/kayobe/images/ipa/ipa.stderr /opt/kayobe/images/ipa/ipa.stdout; do
            if [[ -f ${build_log} ]]; then
                cp "${build_log}" "${LOG_DIR}/ipa/"
            fi
        done
    fi

    # Overcloud host image build logs. Either file may be absent, so copy
    # them individually.
    if [[ -f /opt/kayobe/images/deployment_image/deployment_image.stderr ]] || [[ -f /opt/kayobe/images/deployment_image/deployment_image.stdout ]]; then
        mkdir -p "${LOG_DIR}/deployment_image"
        for build_log in /opt/kayobe/images/deployment_image/deployment_image.stderr /opt/kayobe/images/deployment_image/deployment_image.stdout; do
            if [[ -f ${build_log} ]]; then
                cp "${build_log}" "${LOG_DIR}/deployment_image/"
            fi
        done
    fi

    # The script is run with elevated privileges; make the results readable
    # by the stack user that downloads them.
    chown -R stack: "${LOG_DIR}"
}

copy_logs

0 commit comments

Comments
 (0)