diff --git a/doc/source/configuration/monitoring.rst b/doc/source/configuration/monitoring.rst index f9c8210b9..9abe187fb 100644 --- a/doc/source/configuration/monitoring.rst +++ b/doc/source/configuration/monitoring.rst @@ -137,6 +137,8 @@ depending on your configuration, you may need set the ``kolla_enable_prometheus_ceph_mgr_exporter`` variable to ``true`` in order to enable the ceph mgr exporter. +.. _os-capacity: + OpenStack Capacity ================== @@ -160,9 +162,19 @@ project domain name in ``stackhpc-monitoring.yml``: stackhpc_os_capacity_openstack_region_name: Additionally, you should ensure these credentials have the correct permissions -for the exporter. If you are deploying in a cloud with internal TLS, you may be required -to disable certificate verification for the OpenStack Capacity exporter -if your certificate is not signed by a trusted CA. +for the exporter. + +If you are deploying in a cloud with internal TLS, you may be required +to provide a CA certificate for the OpenStack Capacity exporter if your +certificate is not signed by a trusted CA. For example, to use a CA certificate +named ``vault.crt`` that is also added to the Kolla containers: + +.. code-block:: yaml + + stackhpc_os_capacity_openstack_cacert: "{{ kayobe_env_config_path }}/kolla/certificates/ca/vault.crt" + +Alternatively, to disable certificate verification for the OpenStack Capacity +exporter: .. code-block:: yaml diff --git a/doc/source/configuration/release-train.rst b/doc/source/configuration/release-train.rst index f7cdafc85..0d62fadfd 100644 --- a/doc/source/configuration/release-train.rst +++ b/doc/source/configuration/release-train.rst @@ -147,6 +147,33 @@ By default, HashiCorp images (Consul and Vault) are not synced from Docker Hub to the local Pulp. To sync these images, set ``stackhpc_sync_hashicorp_images`` to ``true``. 
+Custom container images +----------------------- + +A custom list of container images can be synced to the local Pulp using the +``stackhpc_pulp_repository_container_repos_extra`` and +``stackhpc_pulp_distribution_container_extra`` variables. + +.. code-block:: yaml + + # List of extra container image repositories. + stackhpc_pulp_repository_container_repos_extra: + - name: "certbot/certbot" + url: "https://registry-1.docker.io" + policy: on_demand + proxy_url: "{{ pulp_proxy_url }}" + state: present + include_tags: "nightly" + required: True + + # List of extra container image distributions. + stackhpc_pulp_distribution_container_extra: + - name: certbot + repository: certbot/certbot + base_path: certbot/certbot + state: present + required: True + Usage ===== diff --git a/doc/source/configuration/vault.rst b/doc/source/configuration/vault.rst index a63dd0da3..b34d77e43 100644 --- a/doc/source/configuration/vault.rst +++ b/doc/source/configuration/vault.rst @@ -196,6 +196,8 @@ Enable the required TLS variables in kayobe and kolla # Whether TLS is enabled for the internal API endpoints. Default is 'no'. kolla_enable_tls_internal: yes + See :ref:`os-capacity` for information on adding CA certificates to the trust store when deploying the OpenStack Capacity exporter. + 2. Set the following in etc/kayobe/kolla/globals.yml or if environments are being used etc/kayobe/environments/$KAYOBE_ENVIRONMENT/kolla/globals.yml .. code-block:: diff --git a/doc/source/operations/secret-rotation.rst b/doc/source/operations/secret-rotation.rst index f3213da37..5761fd4d6 100644 --- a/doc/source/operations/secret-rotation.rst +++ b/doc/source/operations/secret-rotation.rst @@ -46,7 +46,7 @@ process easier. This was previously mitigated with a change to the StackHPC fork of Kolla-Ansible, which has since been reverted due to an unforeseen issue. See - `here ` for more + `here `__ for more details. #. 
A change to Nova, to automate :ref:`this` step to change the diff --git a/etc/kayobe/ansible/deploy-os-capacity-exporter.yml b/etc/kayobe/ansible/deploy-os-capacity-exporter.yml index cc3afa7b0..41d91bfbd 100644 --- a/etc/kayobe/ansible/deploy-os-capacity-exporter.yml +++ b/etc/kayobe/ansible/deploy-os-capacity-exporter.yml @@ -27,6 +27,7 @@ delegate_to: localhost register: credential when: stackhpc_enable_os_capacity + changed_when: false - name: Set facts for admin credentials ansible.builtin.set_fact: @@ -43,6 +44,16 @@ src: templates/os_capacity-clouds.yml.j2 dest: /opt/kayobe/os-capacity/clouds.yaml when: stackhpc_enable_os_capacity + register: clouds_yaml_result + + - name: Copy CA certificate to OpenStack Capacity nodes + ansible.builtin.copy: + src: "{{ stackhpc_os_capacity_openstack_cacert }}" + dest: /opt/kayobe/os-capacity/cacert.pem + when: + - stackhpc_enable_os_capacity + - stackhpc_os_capacity_openstack_cacert | length > 0 + register: cacert_result - name: Ensure os_capacity container is running community.docker.docker_container: @@ -56,6 +67,7 @@ source: /opt/kayobe/os-capacity/ target: /etc/openstack/ network_mode: host + restart: "{{ clouds_yaml_result is changed or cacert_result is changed }}" restart_policy: unless-stopped become: true when: stackhpc_enable_os_capacity diff --git a/etc/kayobe/ansible/templates/os_capacity-clouds.yml.j2 b/etc/kayobe/ansible/templates/os_capacity-clouds.yml.j2 index ef3c8d7a5..6475848ba 100644 --- a/etc/kayobe/ansible/templates/os_capacity-clouds.yml.j2 +++ b/etc/kayobe/ansible/templates/os_capacity-clouds.yml.j2 @@ -10,6 +10,9 @@ clouds: interface: "internal" identity_api_version: 3 auth_type: "password" +{% if stackhpc_os_capacity_openstack_cacert | length > 0 %} + cacert: /etc/openstack/cacert.pem +{% endif %} {% if not stackhpc_os_capacity_openstack_verify | bool %} verify: False {% endif %} diff --git a/etc/kayobe/environments/ci-multinode/stackhpc-monitoring.yml 
b/etc/kayobe/environments/ci-multinode/stackhpc-monitoring.yml new file mode 100644 index 000000000..93ce650b4 --- /dev/null +++ b/etc/kayobe/environments/ci-multinode/stackhpc-monitoring.yml @@ -0,0 +1,3 @@ +--- +# Path to a CA certificate file to trust in the OpenStack Capacity exporter. +stackhpc_os_capacity_openstack_cacert: "{{ kayobe_env_config_path }}/kolla/certificates/ca/vault.crt" diff --git a/etc/kayobe/kolla.yml b/etc/kayobe/kolla.yml index 3997dcf52..3a1f37051 100644 --- a/etc/kayobe/kolla.yml +++ b/etc/kayobe/kolla.yml @@ -339,6 +339,7 @@ kolla_build_blocks: ARG prometheus_msteams_sha256sum=0f4df9ee31e655d1ec876ea2c53ab5ae5b07143ef21b9190e61b4d52839e135c ARG prometheus_msteams_url=https://github.com/prometheus-msteams/prometheus-msteams/releases/download/v${prometheus_msteams_version}/prometheus-msteams-linux-{{debian_arch}} {% endraw %} + # Dict mapping image customization variable names to their values. # Each variable takes the form: # __ diff --git a/etc/kayobe/kolla/config/prometheus/system.rules b/etc/kayobe/kolla/config/prometheus/system.rules index 613368be6..7981a5609 100644 --- a/etc/kayobe/kolla/config/prometheus/system.rules +++ b/etc/kayobe/kolla/config/prometheus/system.rules @@ -24,6 +24,24 @@ groups: summary: "Prometheus exporter at {{ $labels.instance }} reports low memory" description: "Available memory is {{ $value }} GiB." + - alert: LowSwapSpace + expr: (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes) < {% endraw %}{{ alertmanager_node_free_swap_warning_threshold_ratio }}{% raw %} + for: 1m + labels: + severity: warning + annotations: + summary: "Swap space at {{ $labels.instance }} reports low memory" + description: "Available swap space is {{ $value | humanizePercentage }}. Running out of swap space causes OOM Kills." 
+ + - alert: LowSwapSpace + expr: (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes) < {% endraw %}{{ alertmanager_node_free_swap_critical_threshold_ratio }}{% raw %} + for: 1m + labels: + severity: critical + annotations: + summary: "Swap space at {{ $labels.instance }} reports low memory" + description: "Available swap space is {{ $value | humanizePercentage }}. Running out of swap space causes OOM Kills." + - alert: HostOomKillDetected expr: increase(node_vmstat_oom_kill[5m]) > 0 for: 5m diff --git a/etc/kayobe/pulp.yml b/etc/kayobe/pulp.yml index f10f5ab65..df7df353b 100644 --- a/etc/kayobe/pulp.yml +++ b/etc/kayobe/pulp.yml @@ -651,14 +651,22 @@ stackhpc_pulp_distribution_container_hashicorp: state: present required: "{{ stackhpc_sync_hashicorp_images | bool }}" +# List of extra container image repositories. +stackhpc_pulp_repository_container_repos_extra: [] + +# List of extra container image distributions. +stackhpc_pulp_distribution_container_extra: [] + # List of container image repositories. stackhpc_pulp_repository_container_repos: >- {{ (stackhpc_pulp_repository_container_repos_kolla + stackhpc_pulp_repository_container_repos_ceph + - stackhpc_pulp_repository_container_repos_hashicorp) | selectattr('required') }} + stackhpc_pulp_repository_container_repos_hashicorp + + stackhpc_pulp_repository_container_repos_extra) | selectattr('required') }} # List of container image distributions. 
stackhpc_pulp_distribution_container: >- {{ (stackhpc_pulp_distribution_container_kolla + stackhpc_pulp_distribution_container_ceph + - stackhpc_pulp_distribution_container_hashicorp) | selectattr('required') }} + stackhpc_pulp_distribution_container_hashicorp + + stackhpc_pulp_distribution_container_extra) | selectattr('required') }} diff --git a/etc/kayobe/stackhpc-monitoring.yml b/etc/kayobe/stackhpc-monitoring.yml index e8e0bb91f..d6d1c1a76 100644 --- a/etc/kayobe/stackhpc-monitoring.yml +++ b/etc/kayobe/stackhpc-monitoring.yml @@ -12,6 +12,12 @@ alertmanager_low_memory_threshold_gib: 5 # link. Change to false to disable this alert. alertmanager_warn_network_bond_single_link: true +# Threshold to trigger a LowSwapSpace alert on swap space depletion (ratio). +# When the ratio of free swap space is lower than each of these values, warning +# and critical alerts will be triggered respectively. +alertmanager_node_free_swap_warning_threshold_ratio: 0.25 +alertmanager_node_free_swap_critical_threshold_ratio: 0.1 + ############################################################################### # Exporter configuration @@ -20,6 +26,9 @@ alertmanager_warn_network_bond_single_link: true # targets being templated during deployment. stackhpc_enable_os_capacity: true +# Path to a CA certificate file to trust in the OpenStack Capacity exporter. +stackhpc_os_capacity_openstack_cacert: "" + # Whether TLS certificate verification is enabled for the OpenStack Capacity # exporter during Keystone authentication. 
stackhpc_os_capacity_openstack_verify: true diff --git a/releasenotes/notes/add-alerts-for-swap-availability-75e28ed7f913d1ec.yaml b/releasenotes/notes/add-alerts-for-swap-availability-75e28ed7f913d1ec.yaml new file mode 100644 index 000000000..db5efb85c --- /dev/null +++ b/releasenotes/notes/add-alerts-for-swap-availability-75e28ed7f913d1ec.yaml @@ -0,0 +1,13 @@ +--- +features: + - | + Added two alerts (warning and critical) that are triggered when the ratio + of (free_swap_space / total_swap_space) is below thresholds. + Each threshold can be modified by altering the value of + ``alertmanager_node_free_swap_warning_threshold_ratio`` and + ``alertmanager_node_free_swap_critical_threshold_ratio``. + + Currently this solution has the limitation of a one-size-fits-all policy. + This can cause unwanted alerts for the hosts which utilise swap heavily. + Therefore it is recommended to tune the thresholds or apply silence rules + to suit your needs. diff --git a/releasenotes/notes/os-capacity-cacert-8b800b22d84ae0b1.yaml b/releasenotes/notes/os-capacity-cacert-8b800b22d84ae0b1.yaml new file mode 100644 index 000000000..e20ee0714 --- /dev/null +++ b/releasenotes/notes/os-capacity-cacert-8b800b22d84ae0b1.yaml @@ -0,0 +1,4 @@ +--- +features: + - | + Adds support for providing a CA certificate for OpenStack Capacity exporter. diff --git a/releasenotes/notes/pulp-container-extra-9379806192900d22.yaml b/releasenotes/notes/pulp-container-extra-9379806192900d22.yaml new file mode 100644 index 000000000..a725a4863 --- /dev/null +++ b/releasenotes/notes/pulp-container-extra-9379806192900d22.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + Allows synchronising a custom list of containers to Pulp using the + ``stackhpc_pulp_repository_container_repos_extra`` and + ``stackhpc_pulp_distribution_container_extra`` variables.