diff --git a/.github/workflows/stackhpc-all-in-one.yml b/.github/workflows/stackhpc-all-in-one.yml
index d9bdf2fa0..3206a2cd9 100644
--- a/.github/workflows/stackhpc-all-in-one.yml
+++ b/.github/workflows/stackhpc-all-in-one.yml
@@ -179,6 +179,7 @@ jobs:
           OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }}

       - name: Terraform Apply
+        id: tf_apply
         run: |
           for attempt in $(seq 5); do
             if terraform apply -auto-approve; then
@@ -355,6 +356,7 @@ jobs:
         if: inputs.upgrade

       - name: Tempest tests
+        id: tempest
         run: |
           mkdir -p tempest-artifacts
           docker run -t --rm \
@@ -380,13 +382,29 @@ jobs:
         env:
           KAYOBE_AUTOMATION_SSH_PRIVATE_KEY: ${{ steps.ssh_key.outputs.ssh_key }}

+      - name: Collect diagnostic information
+        id: diagnostics
+        run: |
+          mkdir -p diagnostics
+          sudo -E docker run -t --rm \
+            -v $(pwd):/stack/kayobe-automation-env/src/kayobe-config \
+            -v $(pwd)/diagnostics:/stack/diagnostics \
+            -e KAYOBE_ENVIRONMENT -e KAYOBE_VAULT_PASSWORD -e KAYOBE_AUTOMATION_SSH_PRIVATE_KEY \
+            $KAYOBE_IMAGE \
+            /stack/kayobe-automation-env/src/kayobe-config/.automation/pipeline/playbook-run.sh '$KAYOBE_CONFIG_PATH/ansible/diagnostics.yml'
+        env:
+          KAYOBE_AUTOMATION_SSH_PRIVATE_KEY: ${{ steps.ssh_key.outputs.ssh_key }}
+        if: ${{ !cancelled() && steps.tf_apply.outcome == 'success' }}
+
       - name: Upload test result artifacts
         uses: actions/upload-artifact@v4
         with:
           name: test-results-${{ inputs.os_distribution }}-${{ inputs.os_release }}-${{ inputs.neutron_plugin }}${{ inputs.upgrade && '-upgrade' || '' }}
           path: |
+            diagnostics/
             tempest-artifacts/
             sot-results/
+        if: ${{ !cancelled() && (steps.tempest.outcome == 'success' || steps.stackhpc-openstack-tests.outcome == 'success' || steps.diagnostics.outcome == 'success') }}

       - name: Fail if any Tempest tests failed
         run: |
diff --git a/etc/kayobe/ansible/diagnostics.yml b/etc/kayobe/ansible/diagnostics.yml
new file mode 100644
index 000000000..7764698cb
--- /dev/null
+++ b/etc/kayobe/ansible/diagnostics.yml
@@ -0,0 +1,70 @@
+---
+# This playbook runs a script that collects diagnostic information from hosts.
+# The diagnostics are aggregated to a directory
+# (diagnostics_path_local/inventory_hostname) on localhost.
+#
+# NOTE: The diagnostic information contains sensitive information such as
+# passwords in configuration files.
+
+- name: Collect diagnostic information
+  hosts: seed-hypervisor:seed:overcloud:infra-vms
+  vars:
+    diagnostics_path_local: "{{ lookup('env', 'PWD') }}/diagnostics"
+  tasks:
+    - block:
+        - name: Create a temporary directory for diagnostics
+          ansible.builtin.tempfile:
+            state: directory
+            suffix: diagnostics
+          register: diagnostics_tmpdir
+
+        - name: Write host variables to a file
+          ansible.builtin.copy:
+            content: "{{ hostvars[inventory_hostname].ansible_facts | to_nice_json }}"
+            dest: "{{ diagnostics_tmpdir.path }}/facts.json"
+
+        - name: Run diagnostics script
+          ansible.builtin.script: "{{ kayobe_config_path }}/../../tools/diagnostics.sh"
+          become: true
+          failed_when: diagnostics_result.rc is not defined
+          register: diagnostics_result
+          environment:
+            LOG_DIR: "{{ diagnostics_tmpdir.path }}"
+            CONFIG_DIR: "{{ kayobe_config_path }}/../.."
+
+        - name: Download diagnostic logs to localhost
+          ansible.posix.synchronize:
+            src: "{{ diagnostics_tmpdir.path }}/"
+            dest: "{{ diagnostics_path_local }}/{{ inventory_hostname }}"
+            mode: pull
+            archive: no
+            recursive: true
+            copy_links: true
+            verify_host: true
+            # For jump host
+            use_ssh_args: true
+      always:
+        - name: Clean up temporary directory
+          ansible.builtin.file:
+            path: "{{ diagnostics_tmpdir.path }}"
+            state: absent
+
+        - name: Display diagnostics collection stdout
+          ansible.builtin.debug:
+            msg: "{{ diagnostics_result.stdout }}"
+          when: diagnostics_result.stdout is defined
+
+        - name: Display diagnostics collection stderr
+          ansible.builtin.debug:
+            msg: "{{ diagnostics_result.stderr }}"
+          when: diagnostics_result.stderr is defined
+
+        - name: Fail if diagnostics collection failed
+          ansible.builtin.fail:
+            msg: Diagnostics collection failed
+          when: diagnostics_result.rc != 0
+
+        - name: Display location of diagnostics archive
+          ansible.builtin.debug:
+            msg: >-
+              Wrote diagnostics to {{ diagnostics_path_local }} on localhost
diff --git a/releasenotes/notes/diagnostics-378a6693a64d0b3c.yaml b/releasenotes/notes/diagnostics-378a6693a64d0b3c.yaml
new file mode 100644
index 000000000..4e8d45ee7
--- /dev/null
+++ b/releasenotes/notes/diagnostics-378a6693a64d0b3c.yaml
@@ -0,0 +1,14 @@
+---
+features:
+  - |
+    Adds a new ``diagnostics.yml`` playbook that collects diagnostic
+    information from hosts. The diagnostics are aggregated to a directory
+    (``$PWD/diagnostics/`` by default) on localhost. The diagnostics include:
+
+    * Docker container logs
+    * Kolla configuration files
+    * Log files
+
+    *The collected diagnostic information contains sensitive information such
+    as passwords in configuration files.*
+
diff --git a/tools/diagnostics.sh b/tools/diagnostics.sh
new file mode 100644
index 000000000..73d61775a
--- /dev/null
+++ b/tools/diagnostics.sh
@@ -0,0 +1,124 @@
+#!/bin/bash
+
+# NOTE(mgoddard): This has been adapted from
+# roles/kayobe-diagnostics/files/get_logs.sh in Kayobe.
+
+# Environment variables:
+# $LOG_DIR is the directory to copy logs to.
+
+# TODO: Make this script more robust and use set -e.
+set +o errexit
+set -u
+
+copy_logs() {
+    mkdir -p ${LOG_DIR}/{docker_logs,kolla_node_configs,system_logs}
+
+    cp -rnL /etc/kolla/* ${LOG_DIR}/kolla_node_configs
+    # Don't save the IPA images.
+    rm ${LOG_DIR}/kolla_node_configs/ironic-http/ironic-agent.{kernel,initramfs}
+    rm ${LOG_DIR}/kolla_node_configs/ironic-tftp/ironic-agent.{kernel,initramfs}
+
+    if [[ -d /opt/kayobe/etc/kolla ]]; then
+        mkdir -p ${LOG_DIR}/kolla_build_configs
+        cp -rnL /opt/kayobe/etc/kolla/* ${LOG_DIR}/kolla_build_configs/
+    fi
+
+    cp -rvnL /var/log/* ${LOG_DIR}/system_logs/
+
+    journalctl --no-pager > ${LOG_DIR}/system_logs/syslog.log
+    journalctl --no-pager -u docker.service > ${LOG_DIR}/system_logs/docker.log
+    journalctl --no-pager -u vbmcd.service > ${LOG_DIR}/system_logs/vbmcd.log
+    journalctl --no-pager -u NetworkManager.service > ${LOG_DIR}/system_logs/NetworkManager.log
+
+    if [[ -d /etc/sysconfig/network-scripts/ ]]; then
+        cp -r /etc/sysconfig/network-scripts/ ${LOG_DIR}/system_logs/
+    fi
+
+    if [[ -d /etc/NetworkManager/system-connections/ ]]; then
+        cp -r /etc/NetworkManager/system-connections/ ${LOG_DIR}/system_logs/
+    fi
+
+    if [[ -d /etc/yum.repos.d/ ]]; then
+        cp -r /etc/yum.repos.d/ ${LOG_DIR}/system_logs/
+    fi
+
+    if [[ -d /etc/apt/sources.list.d/ ]]; then
+        cp -r /etc/apt/sources.list.d/ ${LOG_DIR}/system_logs/
+    fi
+
+    if [[ -d /etc/systemd/ ]]; then
+        cp -rL /etc/systemd/ ${LOG_DIR}/system_logs/
+    fi
+
+    df -h > ${LOG_DIR}/system_logs/df.txt
+    # Gather disk usage statistics for files and directories larger than 1MB
+    du -d 5 -hx / | sort -hr | grep '^[0-9\.]*[MGT]' > ${LOG_DIR}/system_logs/du.txt
+    free > ${LOG_DIR}/system_logs/free.txt
+    cat /etc/hosts > ${LOG_DIR}/system_logs/hosts.txt
+    parted -l > ${LOG_DIR}/system_logs/parted-l.txt
+    mount > ${LOG_DIR}/system_logs/mount.txt
+    env > ${LOG_DIR}/system_logs/env.txt
+    ip address > ${LOG_DIR}/system_logs/ip-address.txt
+    ip route > ${LOG_DIR}/system_logs/ip-route.txt
+    ip route show table all > ${LOG_DIR}/system_logs/ip-route-all-tables.txt
+    ip rule list > ${LOG_DIR}/system_logs/ip-rule-list.txt
+    pvs > ${LOG_DIR}/system_logs/pvs.txt
+    vgs > ${LOG_DIR}/system_logs/vgs.txt
+    lvs > ${LOG_DIR}/system_logs/lvs.txt
+
+    iptables-save > ${LOG_DIR}/system_logs/iptables.txt
+
+    if [ `command -v dpkg` ]; then
+        dpkg -l > ${LOG_DIR}/system_logs/dpkg-l.txt
+    fi
+    if [ `command -v rpm` ]; then
+        rpm -qa > ${LOG_DIR}/system_logs/rpm-qa.txt
+    fi
+
+    # final memory usage and process list
+    ps -eo user,pid,ppid,lwp,%cpu,%mem,size,rss,cmd > ${LOG_DIR}/system_logs/ps.txt
+
+    # available entropy
+    cat /proc/sys/kernel/random/entropy_avail > ${LOG_DIR}/system_logs/entropy_avail.txt
+
+    # docker related information
+    (docker info && docker images && docker ps -a) > ${LOG_DIR}/system_logs/docker-info.txt
+
+    for container in $(docker ps -a --format "{{.Names}}"); do
+        docker logs --tail all ${container} &> ${LOG_DIR}/docker_logs/${container}.txt
+    done
+
+    # Bifrost: grab config files and logs from the container.
+    if [[ $(docker ps -q -f name=bifrost_deploy) ]]; then
+        mkdir -p ${LOG_DIR}/bifrost
+        for service in dnsmasq ironic-api ironic-conductor ironic-inspector mariadb nginx rabbitmq-server; do
+            mkdir -p ${LOG_DIR}/bifrost/$service
+            docker exec bifrost_deploy \
+                systemctl status $service -l -n 10000 > ${LOG_DIR}/bifrost/$service/${service}-systemd-status.txt
+            docker exec bifrost_deploy \
+                journalctl -u $service --no-pager > ${LOG_DIR}/bifrost/$service/${service}-journal.txt
+        done
+        docker exec -it bifrost_deploy \
+            journalctl --no-pager > ${LOG_DIR}/bifrost/bifrost-journal.log
+        for d in dnsmasq.conf ironic ironic-inspector nginx/nginx.conf; do
+            docker cp bifrost_deploy:/etc/$d ${LOG_DIR}/kolla_node_configs/bifrost/
+        done
+        docker cp bifrost_deploy:/var/log/mariadb/mariadb.log ${LOG_DIR}/bifrost/mariadb/
+    fi
+
+    # IPA build logs
+    if [[ -f /opt/kayobe/images/ipa/ipa.stderr ]] || [[ -f /opt/kayobe/images/ipa/ipa.stdout ]]; then
+        mkdir -p ${LOG_DIR}/ipa
+        cp /opt/kayobe/images/ipa/ipa.stderr /opt/kayobe/images/ipa/ipa.stdout ${LOG_DIR}/ipa/
+    fi
+
+    # Overcloud host image build logs
+    if [[ -f /opt/kayobe/images/deployment_image/deployment_image.stderr ]] || [[ -f /opt/kayobe/images/deployment_image/deployment_image.stdout ]]; then
+        mkdir -p ${LOG_DIR}/deployment_image
+        cp /opt/kayobe/images/deployment_image/deployment_image.stderr /opt/kayobe/images/deployment_image/deployment_image.stdout ${LOG_DIR}/deployment_image/
+    fi
+
+    chown -R stack: ${LOG_DIR}
+}
+
+copy_logs
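
Outside CI, the new playbook can also be run by hand. A minimal sketch of a manual invocation, assuming a configured Kayobe control host with this kayobe-config checked out and the kayobe-env script sourced (so that $KAYOBE_CONFIG_PATH is set); the -e override of diagnostics_path_local is optional and simply redirects the aggregation directory defined in the playbook, with /tmp/diagnostics used here purely as an example destination:

    # Collect diagnostics from all reachable hosts into $PWD/diagnostics/<hostname>/ (the default).
    kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/diagnostics.yml

    # Aggregate the results into a different directory on localhost.
    kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/diagnostics.yml \
        -e diagnostics_path_local=/tmp/diagnostics

As the release note warns, the collected files can contain passwords from configuration files, so treat the output directory as sensitive.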