diff --git a/build01/configuration.nix b/build01/configuration.nix
index 3b007a4..b434d35 100644
--- a/build01/configuration.nix
+++ b/build01/configuration.nix
@@ -26,7 +26,7 @@ in
 
     ../profiles/common.nix
     ../profiles/docker.nix
-
+    ../profiles/telegraf
     ../services/hound
   ] ++ userImports;
 
diff --git a/profiles/telegraf/default.nix b/profiles/telegraf/default.nix
new file mode 100644
index 0000000..c82a6ac
--- /dev/null
+++ b/profiles/telegraf/default.nix
@@ -0,0 +1,93 @@
+# Telegraf monitoring profile: enables telegraf with a set of host inputs and
+# a prometheus_client output listening on TCP port 9273.
+{ pkgs, lib, config, ... }: let
+  # The host counts as a VM when a Xen or virtio module is listed in
+  # boot.initrd.kernelModules; the SMART input and its sudo rule are skipped there.
+  isVM = lib.any (mod: mod == "xen-blkfront" || mod == "virtio_console") config.boot.initrd.kernelModules;
+in {
+  config = {
+    systemd.services.telegraf.path = [ pkgs.nvme-cli ];
+    # Port of the prometheus_client output configured below.
+    networking.firewall.allowedTCPPorts = [ 9273 ];
+
+    services.telegraf = {
+      enable = true;
+      extraConfig = {
+        agent.interval = "60s";
+        inputs = {
+          kernel_vmstat = {};
+          smart = lib.mkIf (!isVM) {
+            # Wrapper that runs smartctl through sudo; the matching NOPASSWD
+            # rule is declared in security.sudo.extraRules further down.
+            path = pkgs.writeShellScript "smartctl" ''
+              exec /run/wrappers/bin/sudo ${pkgs.smartmontools}/bin/smartctl "$@"
+            '';
+          };
+          system = {};
+          mem = {};
+          file = [{
+            data_format = "influx";
+            file_tag = "name";
+            files = [ "/var/log/telegraf/*" ];
+          }] ++ lib.optional (lib.any (fs: fs == "ext4") config.boot.supportedFilesystems) {
+            name_override = "ext4_errors";
+            files = [ "/sys/fs/ext4/*/errors_count" ];
+            data_format = "value";
+          };
+          # NOTE(review): both commands, including mdraid-health, are only
+          # enabled when ZFS is a supported filesystem -- confirm this is intended.
+          exec = lib.optionalAttrs (lib.any (fs: fs == "zfs") config.boot.supportedFilesystems) {
+            ## Commands array
+            commands = [
+              (pkgs.writeScript "zpool-health" ''
+                #!${pkgs.gawk}/bin/awk -f
+                BEGIN {
+                  while ("${pkgs.zfs}/bin/zpool status" | getline) {
+                    if ($1 ~ /pool:/) { printf "zpool_status,name=%s ", $2 }
+                    if ($1 ~ /state:/) { printf " state=\"%s\",", $2 }
+                    if ($1 ~ /errors:/) {
+                      if (index($2, "No")) printf "errors=0i\n"; else printf "errors=%di\n", $2
+                    }
+                  }
+                }
+              '')
+              (pkgs.runCommandNoCC "mdraid-health" {
+                buildInputs = [ pkgs.bash ];
+              } ''
+                install -m755 ${./mdraid-health.sh} $out
+                patchShebangs $out
+              '')
+            ];
+            data_format = "influx";
+          };
+          systemd_units = {};
+          swap = {};
+          disk.tagdrop = {
+            fstype = [ "tmpfs" "ramfs" "devtmpfs" "devfs" "iso9660" "overlay" "aufs" "squashfs" ];
+            device = [ "rpc_pipefs" "lxcfs" "nsfs" "borgfs" ];
+          };
+        };
+        outputs.prometheus_client = {
+          listen = ":9273";
+          metric_version = 2;
+        };
+      };
+    };
+    # Allow the telegraf user to run smartctl via sudo without a password.
+    security.sudo.extraRules = lib.mkIf (!isVM) [{
+      users = [ "telegraf" ];
+      commands = [ {
+        command = "${pkgs.smartmontools}/bin/smartctl";
+        options = [ "NOPASSWD" ];
+      }];
+    }];
+    # avoid logging sudo use
+    security.sudo.extraConfig = ''
+      Defaults:telegraf !syslog,!pam_session
+    '';
+    # create dummy file to avoid telegraf errors
+    systemd.tmpfiles.rules = [
+      "f /var/log/telegraf/dummy 0444 root root - -"
+    ];
+  };
+}
diff --git a/profiles/telegraf/mdraid-health.sh b/profiles/telegraf/mdraid-health.sh
new file mode 100755
index 0000000..6c2fbaa
--- /dev/null
+++ b/profiles/telegraf/mdraid-health.sh
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+#
+# telegraf plugin for mdstat monitoring
+#
+# output fields:
+#   - measurement: mdstat
+#   - tags:
+#     - host: hostname
+#     - dev: md device name (md0, md1 etc)
+#   - fields:
+#     - mismatch_cnt: number of mismatched sectors during latest check
+#     - state: clear / inactive / suspended / readonly / read-auto / clean / active / write-pending / active-idle
+#     - active_disks: number of active disks in array
+#     - degraded_disks: number of faulty disks in array (0 = healthy)
+#     - total_disks: number of disks in array
+#
+# see https://www.kernel.org/doc/html/v4.15/admin-guide/md.html for more
+#
+# state values:
+#   - clear: no devices, no size, no level
+#   - inactive: may have some settings, but array is not active; all IO results in error
+#   - suspended: all IO requests will block. the array can be reconfigured.
+#   - readonly: no resync can happen. no superblocks get written. Write requests fail
+#   - read-auto: like readonly, but behaves like clean on a write request.
+#   - clean: no pending writes, but otherwise active.
+#   - active: fully active: IO and resync can be happening. when written to inactive array, starts with resync
+#   - write-pending: clean, but writes are blocked waiting for active to be written.
+#   - active-idle: like active, but no writes have been seen for a while
+#
+#
+# sample /proc/mdstat
+#
+# Personalities : [raid1] [linear] [multipath] [raid0] [raid6] [raid5] [raid4] [raid10]
+# md2 : active raid1 sdb3[1] sda3[0]
+#       1936079936 blocks super 1.2 [2/2] [UU]
+#       bitmap: 2/15 pages [8KB], 65536KB chunk
+#
+# md1 : active raid1 sdb2[1] sda2[0]
+#       523712 blocks super 1.2 [2/2] [UU]
+#
+# md0 : active raid1 sdb1[1] sda1[0]
+#       16760832 blocks super 1.2 [2/2] [UU]
+#
+# sample /sys/block/md0/uevent
+#
+# MAJOR=9
+# MINOR=0
+# DEVNAME=md0
+# DEVTYPE=disk
+
+HOST=$(< /proc/sys/kernel/hostname)
+
+# Print the first line of sysfs attribute file $1, or fallback $2 when the
+# file is missing, unreadable or empty. Not every array exposes every
+# attribute (the md admin guide lists degraded and mismatch_cnt only for
+# levels with redundancy), and an empty value would yield an invalid line.
+read_attr() {
+  local value=""
+  [ -r "$1" ] && read -r value < "$1"
+  echo "${value:-$2}"
+}
+
+for MD_SYS_FOLDER in /sys/block/md*; do
+  # Skip the literal, unexpanded glob (host without md arrays) and non-md entries.
+  [ -r "${MD_SYS_FOLDER}/uevent" ] && [ -d "${MD_SYS_FOLDER}/md" ] || continue
+
+  # Parse DEVNAME out of uevent instead of eval'ing the file; MD_DEV is reset
+  # first so a name from a previous iteration cannot leak into this one.
+  MD_DEV=
+  while IFS='=' read -r KEY VALUE || [ -n "${KEY}" ]; do
+    if [ "${KEY}" = "DEVNAME" ]; then
+      MD_DEV=${VALUE}
+    fi
+  done < "${MD_SYS_FOLDER}/uevent"
+  [ -z "${MD_DEV}" ] && continue
+
+  MISMATCH_CNT=$(read_attr "${MD_SYS_FOLDER}/md/mismatch_cnt" 0)
+  STATE=$(read_attr "${MD_SYS_FOLDER}/md/array_state" unknown)
+  DEGRADED_DISKS=$(read_attr "${MD_SYS_FOLDER}/md/degraded" 0)
+  TOTAL_DISKS=$(read_attr "${MD_SYS_FOLDER}/md/raid_disks" 0)
+  # Shell arithmetic: no external expr process required.
+  ACTIVE_DISKS=$(( TOTAL_DISKS - DEGRADED_DISKS ))
+
+  echo "mdstat,host=${HOST},dev=${MD_DEV} mismatch_cnt=${MISMATCH_CNT}i,state=\"${STATE}\",active_disks=${ACTIVE_DISKS}i,degraded_disks=${DEGRADED_DISKS}i,total_disks=${TOTAL_DISKS}i"
+done