Skip to content

Commit 550cf03

Browse files
committed
Update smart metrics to include rated DWPD
1 parent 0f326d8 commit 550cf03

File tree

3 files changed

+195
-22
lines changed

3 files changed

+195
-22
lines changed
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
---
2+
# Collect the distinct NVMe model numbers present on each overcloud host.
# Every host ends up with its own `unique_nvme_models` fact, which a later
# play can read through hostvars.
- name: Gather unique NVMe disk models
  hosts: overcloud
  gather_facts: false
  tasks:
    - name: Retrieve NVMe device information
      command: "nvme list -o json"
      register: nvme_list
      changed_when: false
      become: true

    # Built in a single expression (instead of appending inside a loop) so the
    # fact is always defined, as an empty list, on hosts that report no NVMe
    # devices or produce no output at all.
    - name: Parse NVMe device model names
      set_fact:
        nvme_models: >-
          {{ (nvme_list.stdout | default('{}', true) | from_json).Devices
             | default([])
             | selectattr('ModelNumber', 'defined')
             | map(attribute='ModelNumber')
             | list }}

    # Deliberately NOT run_once: with run_once the task is evaluated on the
    # first host only and its result is applied to every host in the batch,
    # which would hide models that exist only on other hosts.
    - name: Set unique NVMe models as host facts
      set_fact:
        unique_nvme_models: "{{ nvme_models | default([]) | unique }}"
22+
23+
# Merge the NVMe models reported by the overcloud hosts into the local
# dwpd-ratings.yml. Ratings already recorded in the file are kept; models not
# yet listed are added with a placeholder rating of 1 for the operator to
# review. The play targets localhost only, so run_once is not needed.
- name: Update or create dwpd-ratings.yml
  hosts: localhost
  gather_facts: false
  tasks:
    - name: Aggregate unique NVMe models from all overcloud hosts
      set_fact:
        all_nvme_models: >-
          {{ all_nvme_models | default([])
             | union(hostvars[item].unique_nvme_models | default([])) }}
      loop: "{{ groups['overcloud'] | default([]) }}"

    - name: Check whether dwpd-ratings.yml exists
      stat:
        path: "{{ kayobe_env_config_path }}/dwpd-ratings.yml"
      register: dwpd_ratings_stat

    - name: Load existing dwpd-ratings.yml
      set_fact:
        existing_dwpd_yml: "{{ lookup('file', kayobe_env_config_path ~ '/dwpd-ratings.yml') | from_yaml }}"
      when: dwpd_ratings_stat.stat.exists

    # dwpd_lookup stays undefined when there is no existing file (the loop has
    # nothing to iterate over), so later tasks must default it to {}.
    - name: Convert existing YAML array into a dictionary
      set_fact:
        dwpd_lookup: >-
          {{ dwpd_lookup | default({})
             | combine({item.model_name: item.rated_dwpd}) }}
      loop: "{{ existing_dwpd_yml.stackhpc_dwpd_ratings | default([]) }}"
      loop_control:
        label: "{{ item.model_name }}"

    - name: Get list of existing model names
      set_fact:
        existing_model_names: >-
          {{ existing_dwpd_yml.stackhpc_dwpd_ratings | default([])
             | map(attribute='model_name') | list }}

    - name: Combine old + new model names
      set_fact:
        combined_models: "{{ existing_model_names | union(all_nvme_models | default([])) | unique }}"

    # Uses .get() with a fallback so models that have no rating yet (or a
    # first run without any ratings file) do not abort the play.
    - name: Show the rated DWPD that will be recorded for each model
      debug:
        msg: "{{ item }}: {{ (dwpd_lookup | default({})).get(item, 1) }}"
      loop: "{{ combined_models }}"

    - name: Build updated list for stackhpc_dwpd_ratings
      set_fact:
        new_dwpd_list: >-
          {{ new_dwpd_list | default([]) + [{
            'model_name': item,
            'rated_dwpd': (dwpd_lookup | default({})).get(item, 1)
          }] }}
      loop: "{{ combined_models }}"

    # new_dwpd_list is undefined when there are no models at all; write an
    # empty list in that case rather than failing.
    - name: Write updated dwpd-ratings.yml
      copy:
        content: "{{ {'stackhpc_dwpd_ratings': new_dwpd_list | default([])} | to_nice_yaml }}"
        dest: "{{ kayobe_env_config_path }}/dwpd-ratings.yml"
        mode: "0644"

    - name: Show updated dwpd-ratings.yml contents
      debug:
        msg:
          - "Updated local dwpd-ratings.yml contents"
          - "{{ {'stackhpc_dwpd_ratings': new_dwpd_list | default([])} | to_nice_yaml }}"
          - "PLEASE REVIEW AND COMMIT {{ kayobe_env_config_path }}/dwpd-ratings.yml TO VERSION CONTROL."

etc/kayobe/ansible/scripts/nvmemon.sh

Lines changed: 68 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,43 @@ if ! command -v nvme >/dev/null 2>&1; then
2121
exit 1
2222
fi
2323

24+
# jq is used to parse both the nvme-cli JSON output and the ratings file.
command -v jq >/dev/null 2>&1 || {
  echo "${0##*/}: jq is required but not installed. Aborting." >&2
  exit 1
}

# JSON file holding the per-model DWPD ratings
dwpd_file="/opt/kayobe/etc/monitoring/dwpd_ratings.json"

# Associative array filled in by load_dwpd_ratings
declare -A rated_dwpd
33+
34+
# Populate the rated_dwpd associative array from the JSON file at $dwpd_file.
#
# Each element of the JSON is read for its "model_name" and "rated_dwpd"
# fields. Elements without a usable model name or rating are skipped, so a
# lookup for such a model finds nothing and the caller's default applies.
#
# Globals read:    dwpd_file
# Globals written: dwpd_json, rated_dwpd
# A missing file is not an error; it only produces a warning on stderr.
load_dwpd_ratings() {
  local entry key value

  if [[ ! -f "$dwpd_file" ]]; then
    # Warn on stderr: this script prints its metrics on stdout, so a warning
    # there would end up mixed into the metric output.
    echo "Warning: DWPD ratings file not found at '$dwpd_file'. Defaulting to rated_dwpd=1." >&2
    return 0
  fi

  # Read the JSON; if it cannot be read or parsed, fall back to an empty array
  dwpd_json="$(jq -c '.' "$dwpd_file" 2>/dev/null || echo '[]')"

  # Iterate over each element of dwpd_json, one compact JSON value per line
  while IFS= read -r entry; do
    key="$(jq -r '.model_name' <<< "$entry")"
    value="$(jq -r '.rated_dwpd' <<< "$entry")"

    # Strip trailing whitespace only. Model names may contain spaces, so the
    # value must not be cut at the first whitespace character.
    key="${key%"${key##*[![:space:]]}"}"
    value="${value%"${value##*[![:space:]]}"}"

    # Store the entry only when both the model name and its rating are usable
    if [[ -n "$key" && "$key" != "null" && -n "$value" && "$value" != "null" ]]; then
      rated_dwpd["$key"]="$value"
    fi
  done < <(jq -c '.[]' <<< "$dwpd_json")
}
57+
58+
59+
load_dwpd_ratings
60+
2461
output_format_awk="$(
2562
cat <<'OUTPUTAWK'
2663
BEGIN { v = "" }
@@ -44,58 +81,69 @@ format_output() {
4481
nvme_version="$(nvme version | awk '$1 == "nvme" {print $3}')"
4582
echo "nvmecli{version=\"${nvme_version}\"} 1" | format_output
4683

47-
# Get devices (DevicePath and PhysicalSize)
48-
device_info="$(nvme list -o json | jq -c '.Devices[] | {DevicePath: .DevicePath, PhysicalSize: .PhysicalSize}')"
84+
# Get devices (DevicePath, PhysicalSize and ModelNumber), one compact JSON
# object per line
device_info="$(nvme list -o json | jq -c '.Devices[] | {DevicePath, PhysicalSize, ModelNumber}')"

# Convert device_info to an array. A here-string always supplies at least one
# line, even when device_info is empty, so blank lines are skipped to leave
# the array empty when no NVMe devices were listed.
# NOTE(review): if the script runs with `set -u` on bash older than 4.4,
# expanding an empty array is an error - confirm the target bash version.
device_info_array=()
while IFS= read -r line; do
  if [[ -n "$line" ]]; then
    device_info_array+=("$line")
  fi
done <<< "$device_info"
4992

5093
# Loop through the NVMe devices
51-
echo "$device_info" | while read -r device_data; do
52-
device=$(echo "$device_data" | jq -r '.DevicePath')
94+
for device_data in "${device_info_array[@]}"; do
95+
device="$(echo "$device_data" | jq -r '.DevicePath')"
5396
json_check="$(nvme smart-log -o json "${device}")"
5497
disk="${device##*/}"
98+
model_name="$(echo "$device_data" | jq -r '.ModelNumber')"
5599

56-
physical_size=$(echo "$device_data" | jq -r '.PhysicalSize')
57-
echo "physical_size_bytes{device=\"${disk}\"} ${physical_size}"
100+
physical_size="$(echo "$device_data" | jq -r '.PhysicalSize')"
101+
echo "physical_size_bytes{device=\"${disk}\",model=\"${model_name}\"} ${physical_size}"
58102

59103
# The temperature value in JSON is in Kelvin, we want Celsius
60104
value_temperature="$(echo "$json_check" | jq '.temperature - 273')"
61-
echo "temperature_celsius{device=\"${disk}\"} ${value_temperature}"
105+
echo "temperature_celsius{device=\"${disk}\",model=\"${model_name}\"} ${value_temperature}"
106+
107+
# Get the rated DWPD from the dictionary or default to 1 if not found
108+
value_rated_dwpd="${rated_dwpd[$model_name]:-1}"
109+
echo "rated_dwpd{device=\"${disk}\",model=\"${model_name}\"} ${value_rated_dwpd}"
62110

63111
value_available_spare="$(echo "$json_check" | jq '.avail_spare / 100')"
64-
echo "available_spare_ratio{device=\"${disk}\"} ${value_available_spare}"
112+
echo "available_spare_ratio{device=\"${disk}\",model=\"${model_name}\"} ${value_available_spare}"
65113

66114
value_available_spare_threshold="$(echo "$json_check" | jq '.spare_thresh / 100')"
67-
echo "available_spare_threshold_ratio{device=\"${disk}\"} ${value_available_spare_threshold}"
115+
echo "available_spare_threshold_ratio{device=\"${disk}\",model=\"${model_name}\"} ${value_available_spare_threshold}"
68116

69117
value_percentage_used="$(echo "$json_check" | jq '.percent_used / 100')"
70-
echo "percentage_used_ratio{device=\"${disk}\"} ${value_percentage_used}"
118+
echo "percentage_used_ratio{device=\"${disk}\",model=\"${model_name}\"} ${value_percentage_used}"
71119

72120
value_critical_warning="$(echo "$json_check" | jq '.critical_warning')"
73-
echo "critical_warning_total{device=\"${disk}\"} ${value_critical_warning}"
121+
echo "critical_warning_total{device=\"${disk}\",model=\"${model_name}\"} ${value_critical_warning}"
74122

75123
value_media_errors="$(echo "$json_check" | jq '.media_errors')"
76-
echo "media_errors_total{device=\"${disk}\"} ${value_media_errors}"
124+
echo "media_errors_total{device=\"${disk}\",model=\"${model_name}\"} ${value_media_errors}"
77125

78126
value_num_err_log_entries="$(echo "$json_check" | jq '.num_err_log_entries')"
79-
echo "num_err_log_entries_total{device=\"${disk}\"} ${value_num_err_log_entries}"
127+
echo "num_err_log_entries_total{device=\"${disk}\",model=\"${model_name}\"} ${value_num_err_log_entries}"
80128

81129
value_power_cycles="$(echo "$json_check" | jq '.power_cycles')"
82-
echo "power_cycles_total{device=\"${disk}\"} ${value_power_cycles}"
130+
echo "power_cycles_total{device=\"${disk}\",model=\"${model_name}\"} ${value_power_cycles}"
83131

84132
value_power_on_hours="$(echo "$json_check" | jq '.power_on_hours')"
85-
echo "power_on_hours_total{device=\"${disk}\"} ${value_power_on_hours}"
133+
echo "power_on_hours_total{device=\"${disk}\",model=\"${model_name}\"} ${value_power_on_hours}"
86134

87135
value_controller_busy_time="$(echo "$json_check" | jq '.controller_busy_time')"
88-
echo "controller_busy_time_seconds{device=\"${disk}\"} ${value_controller_busy_time}"
136+
echo "controller_busy_time_seconds{device=\"${disk}\",model=\"${model_name}\"} ${value_controller_busy_time}"
89137

90138
value_data_units_written="$(echo "$json_check" | jq '.data_units_written')"
91-
echo "data_units_written_total{device=\"${disk}\"} ${value_data_units_written}"
139+
echo "data_units_written_total{device=\"${disk}\",model=\"${model_name}\"} ${value_data_units_written}"
92140

93141
value_data_units_read="$(echo "$json_check" | jq '.data_units_read')"
94-
echo "data_units_read_total{device=\"${disk}\"} ${value_data_units_read}"
142+
echo "data_units_read_total{device=\"${disk}\",model=\"${model_name}\"} ${value_data_units_read}"
95143

96144
value_host_read_commands="$(echo "$json_check" | jq '.host_read_commands')"
97-
echo "host_read_commands_total{device=\"${disk}\"} ${value_host_read_commands}"
145+
echo "host_read_commands_total{device=\"${disk}\",model=\"${model_name}\"} ${value_host_read_commands}"
98146

99147
value_host_write_commands="$(echo "$json_check" | jq '.host_write_commands')"
100-
echo "host_write_commands_total{device=\"${disk}\"} ${value_host_write_commands}"
148+
echo "host_write_commands_total{device=\"${disk}\",model=\"${model_name}\"} ${value_host_write_commands}"
101149
done | format_output

etc/kayobe/ansible/smartmon-tools.yml

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
---
2-
- name: Install and set up smartmon-tools
2+
- name: Install and set up SMART monitoring tools
33
hosts: overcloud
4-
54
tasks:
65
- name: Ensure smartmontools, jq, nvme-cli and cron/cronie are installed
76
ansible.builtin.package:
@@ -53,3 +52,35 @@
5352
- smartmon
5453
- nvmemon
5554
become: true
55+
56+
# Optionally run get-nvme-drives.yml first. The `bool` filter is required so
# that a string passed on the command line, e.g.
# `-e create_dwpd_ratings=false`, is treated as false rather than as a
# non-empty (truthy) string.
- name: Gather NVMe drives and generate dwpd ratings
  ansible.builtin.import_playbook: get-nvme-drives.yml
  when: create_dwpd_ratings | default(false) | bool
59+
60+
# Publish the configured DWPD ratings to every overcloud host as JSON at
# /opt/kayobe/etc/monitoring/dwpd_ratings.json. Every task is skipped when
# stackhpc_dwpd_ratings is not defined.
- name: Deploy DWPD ratings to overcloud hosts
  hosts: overcloud
  gather_facts: false
  tasks:
    - name: Convert the stackhpc_dwpd_ratings variable to JSON
      ansible.builtin.set_fact:
        dwpd_ratings_json: "{{ stackhpc_dwpd_ratings | default([]) | to_json }}"
      when: stackhpc_dwpd_ratings is defined

    - name: Ensure /opt/kayobe/etc/monitoring directory exists
      ansible.builtin.file:
        path: /opt/kayobe/etc/monitoring
        state: directory
        mode: "0755"
      become: true
      when: stackhpc_dwpd_ratings is defined

    # Deliberately NOT run_once: run_once executes the task on the first host
    # only, which would leave every other overcloud host without the file.
    - name: Copy JSON file to remote
      ansible.builtin.copy:
        content: "{{ dwpd_ratings_json }}"
        dest: /opt/kayobe/etc/monitoring/dwpd_ratings.json
        owner: root
        group: root
        mode: "0644"
      become: true
      when: stackhpc_dwpd_ratings is defined

0 commit comments

Comments
 (0)