From 86a01a46597069254d42a0bbd5655a72ebc210ac Mon Sep 17 00:00:00 2001
From: Bartosz Bezak
Date: Mon, 29 Mar 2021 17:23:13 +0200
Subject: [PATCH 1/2] bump to 2021

---
 source/conf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/conf.py b/source/conf.py
index 31f4f03..e4033b9 100644
--- a/source/conf.py
+++ b/source/conf.py
@@ -18,7 +18,7 @@
 # -- Project information -----------------------------------------------------
 
 project = 'OpenStack Administration Guide'
-copyright = '2020, StackHPC Ltd'
+copyright = '2021, StackHPC Ltd'
 author = 'StackHPC Ltd'

From 1ce5bae46beb0a635744cc57430f56cd4e2b38a3 Mon Sep 17 00:00:00 2001
From: Bartosz Bezak
Date: Mon, 29 Mar 2021 17:24:53 +0200
Subject: [PATCH 2/2] Full shutdown document added

---
 source/full_shutdown.rst             | 206 +++++++++++++++++++++++++++
 source/index.rst                     |   1 +
 source/operations_and_monitoring.rst |  16 +--
 3 files changed, 209 insertions(+), 14 deletions(-)
 create mode 100644 source/full_shutdown.rst

diff --git a/source/full_shutdown.rst b/source/full_shutdown.rst
new file mode 100644
index 0000000..dbbc1d9
--- /dev/null
+++ b/source/full_shutdown.rst
@@ -0,0 +1,206 @@
+.. include:: vars.rst
+
+=======================
+Full Shutdown Procedure
+=======================
+
+If a full shutdown of the system is required, we advise shutting down in the
+following order:
+
+* Perform a graceful shutdown of all virtual machine instances
+* Stop Ceph (if applicable)
+* Put all nodes into maintenance mode in Bifrost
+* Shut down compute nodes
+* Shut down monitoring node
+* Shut down network nodes (if separate from controllers)
+* Shut down controllers
+* Shut down Ceph nodes (if applicable)
+* Shut down seed VM
+* Shut down Ansible control host
+
+Virtual machine shutdown
+------------------------
+
+Ask OpenStack users to shut down their virtual machines gracefully. If that
+is not possible, shut down the VMs using the OpenStack CLI as the admin user:
+
+.. code-block:: bash
+
+   for i in `openstack server list --all-projects -c ID -f value` ; \
+   do openstack server stop $i ; done
+
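+Before moving on, it may be worth confirming that no instances are still
+running. One possible check, run as the admin user, is to list any servers
+that remain in the ACTIVE state:
+
+.. code-block:: bash
+
+   # Prints nothing once all instances have stopped
+   openstack server list --all-projects --status ACTIVE
+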
+.. ifconfig:: deployment['ceph_managed']
+
+    Stop Ceph
+    ---------
+
+    This procedure is based on the `Red Hat documentation `__.
+
+    - Stop the Ceph clients from using any Ceph resources (RBD, RADOS
+      Gateway, CephFS)
+
+    - Check that the cluster is in a healthy state:
+
+      .. code-block:: bash
+
+         ceph status
+
+    - Stop CephFS (if applicable)
+
+      Stop the CephFS cluster by reducing the number of ranks to 1, setting
+      the cluster_down flag, and then failing the last rank:
+
+      .. code-block:: bash
+
+         ceph fs set FS_NAME max_mds 1
+         ceph mds deactivate FS_NAME:1 # rank 2 of 2
+         ceph status # wait for rank 1 to finish stopping
+         ceph fs set FS_NAME cluster_down true
+         ceph mds fail FS_NAME:0
+
+      Setting the cluster_down flag prevents standbys from taking over the
+      failed rank.
+
+    - Set the noout, norecover, norebalance, nobackfill, nodown and pause
+      flags:
+
+      .. code-block:: bash
+
+         ceph osd set noout
+         ceph osd set norecover
+         ceph osd set norebalance
+         ceph osd set nobackfill
+         ceph osd set nodown
+         ceph osd set pause
+
+    - Shut down the OSD nodes one by one:
+
+      .. code-block:: bash
+
+         systemctl stop ceph-osd.target
+
+    - Shut down the monitor/manager nodes one by one:
+
+      .. code-block:: bash
+
+         systemctl stop ceph.target
+
+Set Bifrost maintenance mode
+----------------------------
+
+Set maintenance mode in Bifrost to prevent nodes from automatically powering
+back on:
+
+.. code-block:: bash
+
+   bifrost# for i in `openstack --os-cloud bifrost baremetal node list -c UUID -f value` ; \
+   do openstack --os-cloud bifrost baremetal node maintenance set --reason full-shutdown $i ; done
+
+Shut down nodes
+---------------
+
+Shut down the nodes gracefully, one at a time, by running the following on
+each node:
+
+.. code-block:: bash
+
+   systemctl poweroff
+
+Shut down the seed VM
+---------------------
+
+Shut down the seed VM on the Ansible control host gracefully using:
+
+.. code-block:: bash
+   :substitutions:
+
+   ssh stack@|seed_name| sudo systemctl poweroff
+   virsh shutdown |seed_name|
+
+.. _full-power-on:
+
+Full Power On Procedure
+-----------------------
+
+* Start the Ansible control host and seed VM
+* Remove nodes from maintenance mode in Bifrost
+* Recover the MariaDB cluster
+* Start Ceph (if applicable)
+* Check that all Docker containers are running
+* Check Kibana for any messages with log level ERROR or equivalent
+
+Start Ansible Control Host
+--------------------------
+
+The Ansible control host is not enrolled in Bifrost, and will have to be
+powered on manually.
+
+Start Seed VM
+-------------
+
+The seed VM (and any other service VM) should start automatically when the
+seed hypervisor is powered on. If it does not, it can be started with:
+
+.. code-block:: bash
+
+   virsh start seed-0
+
+Unset Bifrost maintenance mode
+------------------------------
+
+Unsetting maintenance mode in Bifrost should automatically power the nodes
+back on:
+
+.. code-block:: bash
+
+   bifrost# for i in `openstack --os-cloud bifrost baremetal node list -c UUID -f value` ; \
+   do openstack --os-cloud bifrost baremetal node maintenance unset $i ; done
+
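+To confirm that the nodes are indeed powering back on, their power and
+maintenance state can be inspected in Bifrost, for example:
+
+.. code-block:: bash
+
+   # Nodes should eventually show "power on" with Maintenance set to False
+   bifrost# openstack --os-cloud bifrost baremetal node list -c Name -c "Power State" -c Maintenance
+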
+Recover MariaDB cluster
+-----------------------
+
+If all of the servers were shut down at the same time, it is necessary to
+run a script to recover the database once they have all started up. This can
+be done with the following command:
+
+.. code-block:: bash
+
+   kayobe# kayobe overcloud database recover
+
+.. ifconfig:: deployment['ceph_managed']
+
+    Start Ceph
+    ----------
+
+    This procedure is based on the `Red Hat documentation `__.
+
+    - Start the monitor/manager nodes:
+
+      .. code-block:: bash
+
+         systemctl start ceph.target
+
+    - Start the OSD nodes:
+
+      .. code-block:: bash
+
+         systemctl start ceph-osd.target
+
+    - Wait for all the nodes to come up
+
+    - Unset the noout, norecover, norebalance, nobackfill, nodown and pause
+      flags:
+
+      .. code-block:: bash
+
+         ceph osd unset noout
+         ceph osd unset norecover
+         ceph osd unset norebalance
+         ceph osd unset nobackfill
+         ceph osd unset nodown
+         ceph osd unset pause
+
+    - Start CephFS (if applicable)
+
+      Bring the CephFS cluster back up by setting the cluster_down flag to
+      false:
+
+      .. code-block:: bash
+
+         ceph fs set FS_NAME cluster_down false
+
+    - Verify the Ceph cluster status:
+
+      .. code-block:: bash
+
+         ceph status
diff --git a/source/index.rst b/source/index.rst
index 7e8db2c..6c26e87 100644
--- a/source/index.rst
+++ b/source/index.rst
@@ -24,6 +24,7 @@ Contents
    ceph_storage
    managing_users_and_projects
    operations_and_monitoring
+   full_shutdown
    customising_deployment
    gpus_in_openstack
diff --git a/source/operations_and_monitoring.rst b/source/operations_and_monitoring.rst
index c564cfa..88b993d 100644
--- a/source/operations_and_monitoring.rst
+++ b/source/operations_and_monitoring.rst
@@ -502,22 +502,10 @@ Shutting down the seed VM
    kayobe# ssh stack@|seed_name| sudo systemctl poweroff
    kayobe# virsh shutdown |seed_name|
 
-.. _full-shutdown:
-
 Full shutdown
 -------------
 
-In case a full shutdown of the system is required, we advise to use the
-following order:
-
-* Perform a graceful shutdown of all virtual machine instances
-* Shut down compute nodes
-* Shut down monitoring node
-* Shut down network nodes (if separate from controllers)
-* Shut down controllers
-* Shut down Ceph nodes (if applicable)
-* Shut down seed VM
-* Shut down Ansible control host
+Follow the separate :doc:`full shutdown document <full_shutdown>`.
 
 Rebooting a node
 ----------------
@@ -575,7 +563,7 @@ hypervisor is powered on. If it does not, it can be started with:
 Full power on
 -------------
 
-Follow the order in :ref:`full-shutdown`, but in reverse order.
+Follow the separate :ref:`full power on procedure <full-power-on>`.
 
 Shutting Down / Restarting Monitoring Services
 ----------------------------------------------