-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnvme.sh
129 lines (98 loc) · 5.33 KB
/
nvme.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env bash
# Strict mode and command tracing are currently disabled; uncomment to debug.
#set -eu
#set -x #debug mode

# Pin the locale so numeric and date formats produced by child tools are
# predictable, and use a fixed command search path that includes the sbin
# directories.
LC_ALL=C
PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
export LC_ALL PATH

# Root check (currently disabled):
# if [ "$EUID" -ne 0 ]; then
# echo "${0##*/}: Please run as root!" >&2
# exit 1
# fi
# awk program that turns sorted sample lines into text-format output.
# It relies on the caller's field separator: format_output passes "{", so $1
# is the metric name in front of the label set.
output_format_awk='
# Metric name seen on the previous line; empty before the first line.
BEGIN { prev = "" }

# First line of a new metric name: write its HELP and TYPE header.
# Names ending in "_total" are declared as counters, all others as gauges.
prev != $1 {
  print "# HELP disk_" $1 " SMART metric " $1
  kind = ($1 ~ /_total$/) ? "counter" : "gauge"
  print "# TYPE disk_" $1 " " kind
  prev = $1
}

# Every input line is passed through with the "disk_" prefix added.
{ print "disk_" $0 }
'
#######################################
# Sort the lines read from stdin, then run them through the awk program held
# in output_format_awk using "{" as the field separator.
# Globals:   output_format_awk (read)
# Outputs:   the awk program's output on stdout
#######################################
format_output() {
  sort \
    | awk -F '{' "${output_format_awk}"
}
#######################################
# Drop block-device names that must not be probed.
# The previous filter used `grep -v 'sd*'`. As a regular expression that
# means "s followed by zero or more d", so it removed every name containing
# the letter "s". The pattern is now anchored: only names that START with
# "loop" or "sd" are removed. Names such as "sr0" are therefore kept.
# Inputs:    device names on stdin, one per line
# Outputs:   the remaining names on stdout, one per line
# Returns:   grep's status (non-zero when nothing is left)
#######################################
filter_device_names() {
  grep -Ev '^(loop|sd)'
}

# Get devices: top-level block devices only (-d), without a header line (-n).
device_list="$(lsblk -d -n -o NAME | filter_device_names)"
#######################################
# Print one sample line in Prometheus text format, but only for numeric values.
# jq prints "null" for a field that is missing from the report and prints
# nothing when an arithmetic filter fails on it; neither is a valid sample
# value, so such samples are dropped instead of being written out.
# Arguments: $1 - metric name
#            $2 - label list without the surrounding braces
#            $3 - sample value
# Outputs:   "<name>{<labels>} <value>" on stdout, or nothing
#######################################
emit_sample() {
  local name=$1 labels=$2 value=$3
  local number_re='^-?[0-9]+(\.[0-9]+)?([eE][-+]?[0-9]+)?$'
  if [[ "${value}" =~ ${number_re} ]]; then
    printf '%s{%s} %s\n' "${name}" "${labels}" "${value}"
  fi
}

# Numeric NVMe metrics, stored as consecutive pairs: metric name, then the jq
# filter applied to the `smartctl -a -j` report. The spare / percentage
# filters divide the reported number by 100.
# NOTE(review): nvme_controller_busy_time_seconds passes the reported value
# through unscaled - confirm the unit smartctl reports matches the name.
nvme_health_log='.nvme_smart_health_information_log'
nvme_numeric_metrics=(
  nvme_current_temperature '.temperature.current'
  nvme_disk_power_on_time '.power_on_time.hours'
  nvme_available_spare_ratio "${nvme_health_log}.available_spare / 100"
  nvme_available_spare_threshold_ratio "${nvme_health_log}.available_spare_threshold / 100"
  nvme_percentage_used "${nvme_health_log}.percentage_used / 100"
  nvme_critical_warning_total "${nvme_health_log}.critical_warning"
  nvme_media_errors_total "${nvme_health_log}.media_errors"
  nvme_num_err_log_entries_total "${nvme_health_log}.num_err_log_entries"
  nvme_power_cycles_total "${nvme_health_log}.power_cycles"
  nvme_power_on_hours_total "${nvme_health_log}.power_on_hours"
  nvme_controller_busy_time_seconds "${nvme_health_log}.controller_busy_time"
  nvme_data_units_written_total "${nvme_health_log}.data_units_written"
  nvme_data_units_read_total "${nvme_health_log}.data_units_read"
  nvme_host_read_commands_total "${nvme_health_log}.host_reads"
  nvme_host_write_commands_total "${nvme_health_log}.host_writes"
)

#######################################
# Emit the NVMe-specific metrics for one device.
# Globals:   nvme_numeric_metrics (read)
# Arguments: $1 - device name as used in the "device" label (e.g. nvme0n1)
#            $2 - device path passed to smartctl
#            $3 - JSON report from `smartctl -a -j` for that device
# Outputs:   sample lines on stdout
#######################################
collect_nvme_metrics() {
  local disk=$1 device=$2 report_json=$3
  local labels="device=\"${disk}\""
  local health_text info_text verdict health_status health_value capacity
  local i value

  # The text reports are only needed for NVMe devices, so they are fetched
  # here rather than for every device.
  health_text="$(smartctl -H "${device}")"
  info_text="$(smartctl -i "${device}")"

  # Health verdict: sixth word of the line containing "result".
  # Anything other than PASSED is reported with the value 0.
  verdict="$(awk '/result/ {print $6}' <<<"${health_text}")"
  case "${verdict}" in
    PASSED)
      health_status="PASSED"
      health_value=1
      ;;
    FAILED)
      health_status="FAILED"
      health_value=0
      ;;
    *)
      health_status="UNKNOWN"
      health_value=0
      ;;
  esac
  # The status string is carried as a label.
  printf 'nvme_health{device="%s", status="%s"} %s\n' \
    "${disk}" "${health_status}" "${health_value}"

  # Disk capacity: fourth word of the "Namespace 1 Size/Capacity" line with
  # the thousands separators removed. It is carried as a label; the sample
  # value 1 has no meaning of its own.
  capacity="$(awk '/Namespace 1 Size\/Capacity/ {print $4}' <<<"${info_text}" | tr -d ',')"
  printf 'User_Capacity{device="%s", User_Capacity="%s"} 1\n' \
    "${disk}" "${capacity}"

  for ((i = 0; i < ${#nvme_numeric_metrics[@]}; i += 2)); do
    value="$(jq "${nvme_numeric_metrics[i + 1]}" <<<"${report_json}")"
    emit_sample "${nvme_numeric_metrics[i]}" "${labels}" "${value}"
  done
}

#######################################
# Emit all metrics for one block device.
# Arguments: $1 - device name without the /dev/ prefix
# Outputs:   sample lines on stdout
#######################################
collect_disk_metrics() {
  local disk=$1
  local device="/dev/${disk}"
  local report_json model_name

  report_json="$(smartctl -a -j "${device}")"

  # ---- metrics common to every device ----
  printf 'device{device="%s"} 1\n' "${disk}"
  # Model name is carried as a label; the value 1 is a placeholder that only
  # exists so the sample can be scraped.
  model_name="$(jq -r '.model_name' <<<"${report_json}")"
  printf 'model_name{device="%s", model_name="%s"} 1\n' "${disk}" "${model_name}"

  # ---- NVMe-only metrics ----
  if [[ "${disk}" == nvme* ]]; then
    collect_nvme_metrics "${disk}" "${device}" "${report_json}"
  fi
}

# Loop through the devices; device_list is a whitespace-separated string, so
# it is deliberately left unquoted to be split into names.
for disk in ${device_list}; do
  collect_disk_metrics "${disk}"
done | format_output