diff --git a/pillar.example b/pillar.example
index 5da6b94..c04e8dc 100644
--- a/pillar.example
+++ b/pillar.example
@@ -173,3 +173,8 @@ prometheus:
           # You must enable individual collectors
           enable: true
           # pkg: ipmitool
+        smartmon:
+          enable: true
+          # pkg: smartmontools
+          # bash_pkg: bash
+          # smartctl: /usr/sbin/smartctl
diff --git a/prometheus/config/node_exporter/textfile_collectors/files/smartmon.sh.jinja b/prometheus/config/node_exporter/textfile_collectors/files/smartmon.sh.jinja
new file mode 100644
index 0000000..be82f89
--- /dev/null
+++ b/prometheus/config/node_exporter/textfile_collectors/files/smartmon.sh.jinja
@@ -0,0 +1,232 @@
+#!/usr/bin/env bash
+# Script informed by the collectd monitoring script for smartmontools (using smartctl)
+# by Samuel B. (c) 2012
+# source at: http://devel.dob.sk/collectd-scripts/
+
+# TODO: This probably needs to be a little more complex.  The raw numbers can have more
+#       data in them than you'd think.
+#       http://arstechnica.com/civis/viewtopic.php?p=22062211
+
+# Formatting done via shfmt -i 2
+# https://github.com/mvdan/sh
+
+# NOTE: Salt renders this file as a Jinja template before installing it. The
+# smartctl placeholder becomes the configured binary path, and the quoted-brace
+# expression inside the awk program below renders to a literal opening brace.
+
+# awk program: turn each row of the "smartctl -A" attribute table into four metric
+# lines (value, worst, threshold, raw_value) named after the attribute.
+parse_smartctl_attributes_awk="$(
+  cat <<'SMARTCTLAWK'
+$1 ~ /^ *[0-9]+$/ && $2 ~ /^[a-zA-Z0-9_-]+$/ {
+  gsub(/-/, "_");
+  printf "%s_value{{ '{' }}%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $4
+  printf "%s_worst{{ '{' }}%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $5
+  printf "%s_threshold{{ '{' }}%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $6
+  printf "%s_raw_value{{ '{' }}%s,smart_id=\"%s\"} %e\n", $2, labels, $1, $10
+}
+SMARTCTLAWK
+)"
+
+# Attribute names (lower case) that are exported; every other line is filtered out.
+smartmon_attrs="$(
+  cat <<'SMARTMONATTRS'
+airflow_temperature_cel
+command_timeout
+current_pending_sector
+end_to_end_error
+erase_fail_count
+g_sense_error_rate
+hardware_ecc_recovered
+host_reads_mib
+host_reads_32mib
+host_writes_mib
+host_writes_32mib
+load_cycle_count
+media_wearout_indicator
+wear_leveling_count
+nand_writes_1gib
+offline_uncorrectable
+power_cycle_count
+power_on_hours
+program_fail_count
+raw_read_error_rate
+reallocated_event_count
+reallocated_sector_ct
+reported_uncorrect
+sata_downshift_count
+seek_error_rate
+spin_retry_count
+spin_up_time
+start_stop_count
+temperature_case
+temperature_celsius
+temperature_internal
+total_lbas_read
+total_lbas_written
+udma_crc_error_count
+unsafe_shutdown_count
+workld_host_reads_perc
+workld_media_wear_indic
+workload_minutes
+SMARTMONATTRS
+)"
+# Collapse the list into a single "a|b|c" alternation for grep -E.
+smartmon_attrs="$(echo ${smartmon_attrs} | xargs | tr ' ' '|')"
+
+# Read "smartctl -A" output for an ATA device on stdin and print the metric lines
+# whose text matches one of the names in smartmon_attrs.
+# Arguments: $1 - device path, $2 - smartctl device type (both become labels)
+parse_smartctl_attributes() {
+  local disk="$1"
+  local disk_type="$2"
+  local labels="disk=\"${disk}\",type=\"${disk_type}\""
+  sed 's/^ \+//g' |
+    awk -v labels="${labels}" "${parse_smartctl_attributes_awk}" 2>/dev/null |
+    tr A-Z a-z |
+    grep -E "(${smartmon_attrs})"
+}
+
+# Read "smartctl -A" output for a SCSI device on stdin and print the few values
+# that are recognised, one raw_value metric per value that was found.
+# Arguments: $1 - device path, $2 - smartctl device type (both become labels)
+parse_smartctl_scsi_attributes() {
+  local disk="$1"
+  local disk_type="$2"
+  local labels="disk=\"${disk}\",type=\"${disk_type}\""
+  local line attr_type attr_value
+  local power_on='' temp_cel='' lbas_read='' power_cycle='' grown_defects=''
+  while read -r line; do
+    attr_type="$(echo "${line}" | tr '=' ':' | cut -f1 -d: | sed 's/^ \+//g' | tr ' ' '_')"
+    attr_value="$(echo "${line}" | tr '=' ':' | cut -f2 -d: | sed 's/^ \+//g')"
+    case "${attr_type}" in
+    number_of_hours_powered_up_) power_on="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;;
+    Current_Drive_Temperature) temp_cel="$(echo ${attr_value} | cut -f1 -d' ' | awk '{ printf "%e\n", $1 }')" ;;
+    Blocks_read_from_cache_and_sent_to_initiator_) lbas_read="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;;
+    Accumulated_start-stop_cycles) power_cycle="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;;
+    Elements_in_grown_defect_list) grown_defects="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;;
+    esac
+  done
+  [ -n "$power_on" ] && echo "power_on_hours_raw_value{${labels},smart_id=\"9\"} ${power_on}"
+  [ -n "$temp_cel" ] && echo "temperature_celsius_raw_value{${labels},smart_id=\"194\"} ${temp_cel}"
+  [ -n "$lbas_read" ] && echo "total_lbas_read_raw_value{${labels},smart_id=\"242\"} ${lbas_read}"
+  [ -n "$power_cycle" ] && echo "power_cycle_count_raw_value{${labels},smart_id=\"12\"} ${power_cycle}"
+  # The grown defect list is not one of the numbered attributes used above; use -1
+  # so it does not share smart_id 12 with power_cycle_count.
+  [ -n "$grown_defects" ] && echo "grown_defects_count_raw_value{${labels},smart_id=\"-1\"} ${grown_defects}"
+}
+
+# Read "smartctl -i -H" output on stdin and print the device_info metric plus the
+# device_smart_available / device_smart_enabled / device_smart_healthy flags.
+# Arguments: $1 - device path, $2 - smartctl device type (both become labels)
+parse_smartctl_info() {
+  local -i smart_available=0 smart_enabled=0 smart_healthy=0
+  local disk="$1" disk_type="$2"
+  local model_family='' device_model='' serial_number='' fw_version='' vendor='' product='' revision='' lun_id=''
+  local line info_type info_value
+  while read -r line; do
+    info_type="$(echo "${line}" | cut -f1 -d: | tr ' ' '_')"
+    # Escape backslashes and every double quote so the value stays a valid label value.
+    info_value="$(echo "${line}" | cut -f2- -d: | sed 's/^ \+//g' | sed 's/\\/\\\\/g; s/"/\\"/g')"
+    case "${info_type}" in
+    Model_Family) model_family="${info_value}" ;;
+    Device_Model) device_model="${info_value}" ;;
+    Serial_Number) serial_number="${info_value}" ;;
+    Firmware_Version) fw_version="${info_value}" ;;
+    Vendor) vendor="${info_value}" ;;
+    Product) product="${info_value}" ;;
+    Revision) revision="${info_value}" ;;
+    Logical_Unit_id) lun_id="${info_value}" ;;
+    esac
+    if [[ "${info_type}" == 'SMART_support_is' ]]; then
+      case "${info_value:0:7}" in
+      Enabled) smart_enabled=1 ;;
+      Availab) smart_available=1 ;;
+      Unavail) smart_available=0 ;;
+      esac
+    fi
+    if [[ "${info_type}" == 'SMART_overall-health_self-assessment_test_result' ]]; then
+      case "${info_value:0:6}" in
+      PASSED) smart_healthy=1 ;;
+      esac
+    elif [[ "${info_type}" == 'SMART_Health_Status' ]]; then
+      case "${info_value:0:2}" in
+      OK) smart_healthy=1 ;;
+      esac
+    fi
+  done
+  echo "device_info{\
+disk=\"${disk}\",\
+type=\"${disk_type}\",\
+vendor=\"${vendor}\",\
+product=\"${product}\",\
+revision=\"${revision}\",\
+lun_id=\"${lun_id}\",\
+model_family=\"${model_family}\",\
+device_model=\"${device_model}\",\
+serial_number=\"${serial_number}\",\
+firmware_version=\"${fw_version}\"\
+} 1"
+  echo "device_smart_available{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_available}"
+  echo "device_smart_enabled{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_enabled}"
+  echo "device_smart_healthy{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_healthy}"
+}
+
+# awk program: prefix every sample with "smartmon_" and print HELP/TYPE lines each
+# time the metric name changes (so the input has to arrive sorted).
+output_format_awk="$(
+  cat <<'OUTPUTAWK'
+BEGIN { v = "" }
+v != $1 {
+  print "# HELP smartmon_" $1 " SMART metric " $1;
+  print "# TYPE smartmon_" $1 " gauge";
+  v = $1
+}
+{print "smartmon_" $0}
+OUTPUTAWK
+)"
+
+# Sort the metric lines from stdin and run them through output_format_awk.
+format_output() {
+  sort |
+    awk -F'{' "${output_format_awk}"
+}
+
+smartctl_version="$({{ smartctl }} -V | head -n1 | awk '$1 == "smartctl" {print $2}')"
+
+echo "smartctl_version{version=\"${smartctl_version}\"} 1" | format_output
+
+# Stop here when the major version of smartctl is below 6.
+if [[ "$(expr "${smartctl_version}" : '\([0-9]*\)\..*')" -lt 6 ]]; then
+  exit
+fi
+
+device_list="$({{ smartctl }} --scan-open | awk '/^\/dev/{print $1 "|" $3}')"
+
+for device in ${device_list}; do
+  disk="$(echo ${device} | cut -f1 -d'|')"
+  type="$(echo ${device} | cut -f2 -d'|')"
+  active=1
+  echo "smartctl_run{disk=\"${disk}\",type=\"${type}\"}" "$(TZ=UTC date '+%s')"
+  # Check if the device is in a low-power mode
+  {{ smartctl }} -n standby -d "${type}" "${disk}" > /dev/null || active=0
+  echo "device_active{disk=\"${disk}\",type=\"${type}\"}" "${active}"
+  # Skip further metrics to prevent the disk from spinning up
+  test ${active} -eq 0 && continue
+  # Get the SMART information and health
+  {{ smartctl }} -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}"
+  # Get the SMART attributes
+  case ${type} in
+  atacam) {{ smartctl }} -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" ;;
+  sat) {{ smartctl }} -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" ;;
+  sat+megaraid*) {{ smartctl }} -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" ;;
+  scsi) {{ smartctl }} -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" ;;
+  megaraid*) {{ smartctl }} -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" ;;
+  *)
+    # Report on stderr: stdout is piped into format_output and would end up in the
+    # metrics file. Then carry on with the remaining devices instead of dropping them.
+    echo "disk type is not sat, scsi or megaraid but ${type}" >&2
+    continue
+    ;;
+  esac
+done | format_output
diff --git a/prometheus/config/node_exporter/textfile_collectors/smartmon/clean.sls b/prometheus/config/node_exporter/textfile_collectors/smartmon/clean.sls
new file mode 100644
index 0000000..cd692c2
--- /dev/null
+++ b/prometheus/config/node_exporter/textfile_collectors/smartmon/clean.sls
@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+# vim: ft=sls
+
+{#- Get the `tplroot` from `tpldir` #}
+{%- set tplroot = tpldir.split('/')[0] %}
+{%- from tplroot ~ "/map.jinja" import prometheus with context %}
+
+{%- set config = prometheus.exporters.node_exporter.textfile_collectors.smartmon %}
+{%- set dir = prometheus.service.node_exporter.args.get('collector.textfile.directory') %}
+{%- set script = prometheus.dir.textfile_collectors ~ '/smartmon.sh' %}
+
+prometheus-exporters-node-textfile_collectors-smartmon-pkg:
+  pkg.removed:
+    - name: {{ config.pkg }}
+
+prometheus-exporters-node-textfile_collectors-smartmon-script:
+  file.absent:
+    - name: {{ script }}
+
+prometheus-exporters-node-textfile_collectors-smartmon-output:
+  file.absent:
+    - name: {{ dir }}/smartmon.prom
+
+prometheus-exporters-node-textfile_collectors-smartmon-cronjob:
+  cron.absent:
+    - identifier: prometheus-exporters-node-textfile_collectors-smartmon-cronjob
diff --git a/prometheus/config/node_exporter/textfile_collectors/smartmon/init.sls b/prometheus/config/node_exporter/textfile_collectors/smartmon/init.sls
new file mode 100644
index 0000000..3584c5e
--- /dev/null
+++ b/prometheus/config/node_exporter/textfile_collectors/smartmon/init.sls
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+# vim: ft=sls
+
+{#- Get the `tplroot` from `tpldir` #}
+{%- set tplroot = tpldir.split('/')[0] %}
+{%- from tplroot ~ "/map.jinja" import prometheus with context %}
+
+{%- set config = prometheus.exporters.node_exporter.textfile_collectors.smartmon %}
+{%- set dir = prometheus.service.node_exporter.args.get('collector.textfile.directory') %}
+{%- set script = prometheus.dir.textfile_collectors ~ '/smartmon.sh' %}
+
+prometheus-exporters-node-textfile_collectors-smartmon-pkg:
+  pkg.installed:
+    - name: {{ config.pkg }}
+
+prometheus-exporters-node-textfile_collectors-smartmon-pkg-bash:
+  pkg.installed:
+    - name: {{ config.bash_pkg }}
+
+prometheus-exporters-node-textfile_collectors-smartmon-script:
+  file.managed:
+    - name: {{ script }}
+    - source: salt://prometheus/config/node_exporter/textfile_collectors/files/smartmon.sh.jinja
+    - template: jinja
+    - context:
+        smartctl: {{ config.smartctl }}
+    - mode: 755
+    - require:
+      - file: prometheus-node_exporter-textfile_collectors-dir
+
+prometheus-exporters-node-textfile_collectors-smartmon-cronjob:
+  cron.present:
+    - identifier: prometheus-exporters-node-textfile_collectors-smartmon-cronjob
+    - name: cd {{ dir }} && LANG=C {{ script }} > .smartmon.prom$$ && mv .smartmon.prom$$ smartmon.prom
+    - minute: "{{ config.get('minute', '*') }}"
+    - comment: Prometheus' node_exporter's smartmon textfile collector
+    - require:
+      - file: prometheus-exporters-node-textfile_collectors-smartmon-script
diff --git a/prometheus/defaults.yaml b/prometheus/defaults.yaml
index 10d473a..c66fffd 100644
--- a/prometheus/defaults.yaml
+++ b/prometheus/defaults.yaml
@@ -127,3 +127,9 @@ prometheus:
           enable: false
           remove: false
           pkg: ipmitool
+        smartmon:
+          enable: false
+          remove: false
+          pkg: smartmontools
+          bash_pkg: bash
+          smartctl: /usr/sbin/smartctl
diff --git a/prometheus/osfamilymap.yaml b/prometheus/osfamilymap.yaml
index b5243a8..fe0e820 100644
--- a/prometheus/osfamilymap.yaml
+++ b/prometheus/osfamilymap.yaml
@@ -68,6 +68,11 @@ FreeBSD:
         archive_hash: ebcd21dc25e439eed64559e89cd7da9a94073d5ff321a8a3a4214ac2ebe04e34
       statsd_exporter:
         archive_hash: f345dff6311501f09bb5b6ba1128e925d504c6325ee97ad91a975f2be0d44da9
+  exporters:
+    node_exporter:
+      textfile_collectors:
+        smartmon:
+          smartctl: /usr/local/sbin/smartctl
 
 OpenBSD:
   rootgroup: wheel