modules/nixos/monitoring/alert-rules: various

- instance -> host

- simplify telegraf_down

- also exclude darwin from load15
This commit is contained in:
zowoq 2023-08-19 11:25:48 +10:00
parent 161833f265
commit d883b923d5

View file

@ -52,13 +52,13 @@ lib.mapAttrsToList
# Alert on series whose mode label is not "ro" and whose disk_used_percent is >= 80, with time = "10m".
filesystem_full_80percent = { filesystem_full_80percent = {
condition = ''disk_used_percent{mode!="ro"} >= 80''; condition = ''disk_used_percent{mode!="ro"} >= 80'';
time = "10m"; time = "10m";
description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.path}} got less than 20% space left on its filesystem"; description = "{{$labels.host}} device {{$labels.device}} on {{$labels.path}} got less than 20% space left on its filesystem";
}; };
# Alert when the ratio disk_inodes_free / disk_inodes_total drops below 0.10, with time = "10m".
filesystem_inodes_full = { filesystem_inodes_full = {
condition = ''disk_inodes_free / disk_inodes_total < 0.10''; condition = ''disk_inodes_free / disk_inodes_total < 0.10'';
time = "10m"; time = "10m";
description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.path}} got less than 10% inodes left on its filesystem"; description = "{{$labels.host}} device {{$labels.device}} on {{$labels.path}} got less than 10% inodes left on its filesystem";
}; };
daily_task_not_run = { daily_task_not_run = {
@ -101,7 +101,7 @@ lib.mapAttrsToList
}; };
# Alert when system_load15 divided by system_n_cpus is >= 2.0, with time = "10m".
# Hosts whose name starts with "build" or "darwin" are excluded by the host!~ matcher.
# The description states the same threshold and duration as the rule itself.
load15 = { load15 = {
condition = ''system_load15 / system_n_cpus{instance!~"build.*.nix-community.org:9273"} >= 2.0''; condition = ''system_load15 / system_n_cpus{host!~"(build|darwin).*"} >= 2.0'';
time = "10m"; time = "10m";
description = "{{$labels.host}} is running with load15 > 1 for at least 5 minutes: {{$value}}"; description = "{{$labels.host}} is running with load15 per CPU >= 2 for at least 10 minutes: {{$value}}";
}; };
@ -122,14 +122,14 @@ lib.mapAttrsToList
}; };
# Alert when the `up` series for the telegraf job is 0, with time = "3m".
# The aggregation keeps only the job, instance and org labels, so the description
# must reference one of those; host is dropped by `by (...)` and would render empty.
telegraf_down = { telegraf_down = {
condition = ''min(up{job=~"telegraf"}) by (source, job, instance, org) == 0''; condition = ''min(up{job=~"telegraf"}) by (job, instance, org) == 0'';
time = "3m"; time = "3m";
description = "{{$labels.instance}}: {{$labels.job}} telegraf exporter from {{$labels.source}} is down"; description = "{{$labels.instance}}: telegraf exporter is down";
}; };
# Alert whenever http_response_result_code is non-zero; the description reports the server, host and result labels.
http = { http = {
condition = "http_response_result_code != 0"; condition = "http_response_result_code != 0";
description = "{{$labels.server}} : http request failed from {{$labels.instance}}: {{$labels.result}}"; description = "{{$labels.server}} : http request failed from {{$labels.host}}: {{$labels.result}}";
}; };
http_match_failed = { http_match_failed = {
@ -139,44 +139,44 @@ lib.mapAttrsToList
# Alert whenever net_response_result_code is non-zero; the description reports the server, port, protocol and host labels.
connection_failed = { connection_failed = {
condition = "net_response_result_code != 0"; condition = "net_response_result_code != 0";
description = "{{$labels.server}}: connection to {{$labels.port}}({{$labels.protocol}}) failed from {{$labels.instance}}"; description = "{{$labels.server}}: connection to {{$labels.port}}({{$labels.protocol}}) failed from {{$labels.host}}";
}; };
# Alert when the sum of the three ZFS error counters (l2_io_error, dmu_tx_error, l2_writes_error) is above 0.
zfs_errors = { zfs_errors = {
condition = "zfs_arcstats_l2_io_error + zfs_dmu_tx_error + zfs_arcstats_l2_writes_error > 0"; condition = "zfs_arcstats_l2_io_error + zfs_dmu_tx_error + zfs_arcstats_l2_writes_error > 0";
description = "{{$labels.instance}} reports: {{$value}} ZFS IO errors"; description = "{{$labels.host}} reports: {{$value}} ZFS IO errors";
}; };
# Alert when zpool_status_errors is above 0; the pool is identified by the name label.
zpool_status = { zpool_status = {
condition = "zpool_status_errors > 0"; condition = "zpool_status_errors > 0";
description = "{{$labels.instance}} reports: zpool {{$labels.name}} has {{$value}} errors"; description = "{{$labels.host}} reports: zpool {{$labels.name}} has {{$value}} errors";
}; };
# Alert when mdstat_degraded_disks is above 0; the array is identified by the dev label.
mdraid_degraded_disks = { mdraid_degraded_disks = {
condition = "mdstat_degraded_disks > 0"; condition = "mdstat_degraded_disks > 0";
description = "{{$labels.instance}}: raid {{$labels.dev}} has failed disks"; description = "{{$labels.host}}: raid {{$labels.dev}} has failed disks";
}; };
# ignore devices that disabled S.M.A.R.T (example if attached via USB) # ignore devices that disabled S.M.A.R.T (example if attached via USB)
# Also ignore build02, build03 # Also ignore build02, build03
# Fires when smart_device_health_ok is anything other than 1 for the remaining devices.
smart_errors = { smart_errors = {
condition = ''smart_device_health_ok{enabled!="Disabled", instance!~"(build02|build03).nix-community.org:9273"} != 1''; condition = ''smart_device_health_ok{enabled!="Disabled", host!~"(build02|build03)"} != 1'';
description = "{{$labels.instance}}: S.M.A.R.T reports: {{$labels.device}} ({{$labels.model}}) has errors"; description = "{{$labels.host}}: S.M.A.R.T reports: {{$labels.device}} ({{$labels.model}}) has errors";
}; };
# Alert when kernel_vmstat_oom_kill increased at all over the last 5 minutes.
oom_kills = { oom_kills = {
condition = "increase(kernel_vmstat_oom_kill[5m]) > 0"; condition = "increase(kernel_vmstat_oom_kill[5m]) > 0";
description = "{{$labels.instance}}: OOM kill detected"; description = "{{$labels.host}}: OOM kill detected";
}; };
# Alert when rate(diskio_read_time) / rate(diskio_reads) over 1m exceeds 0.1 while reads are occurring.
# NOTE(review): the description says 100ms; whether 0.1 equals 100ms depends on the unit of diskio_read_time (confirm).
unusual_disk_read_latency = { unusual_disk_read_latency = {
condition = "rate(diskio_read_time[1m]) / rate(diskio_reads[1m]) > 0.1 and rate(diskio_reads[1m]) > 0"; condition = "rate(diskio_read_time[1m]) / rate(diskio_reads[1m]) > 0.1 and rate(diskio_reads[1m]) > 0";
description = "{{$labels.instance}}: Disk latency is growing (read operations > 100ms)"; description = "{{$labels.host}}: Disk latency is growing (read operations > 100ms)";
}; };
# Alert when rate(diskio_write_time) / rate(diskio_writes) over 1m exceeds 0.1 while writes are occurring.
# The write counter is diskio_writes (plural), mirroring diskio_reads in the read-latency rule.
# NOTE(review): the description says 100ms; whether 0.1 equals 100ms depends on the unit of diskio_write_time (confirm).
unusual_disk_write_latency = { unusual_disk_write_latency = {
condition = "rate(diskio_write_time[1m]) / rate(diskio_write[1m]) > 0.1 and rate(diskio_write[1m]) > 0"; condition = "rate(diskio_write_time[1m]) / rate(diskio_writes[1m]) > 0.1 and rate(diskio_writes[1m]) > 0";
description = "{{$labels.instance}}: Disk latency is growing (write operations > 100ms)"; description = "{{$labels.host}}: Disk latency is growing (write operations > 100ms)";
}; };
ipv6_dad_check = { ipv6_dad_check = {
@ -186,12 +186,12 @@ lib.mapAttrsToList
# Alert when the per-second rate of node_vmstat_pgmajfault over 1m exceeds 1000.
# NOTE(review): other vmstat rules in this file use the kernel_vmstat_ prefix (e.g. kernel_vmstat_oom_kill);
# confirm that a node_vmstat_pgmajfault series carrying a host label is actually scraped.
host_memory_under_memory_pressure = { host_memory_under_memory_pressure = {
condition = "rate(node_vmstat_pgmajfault[1m]) > 1000"; condition = "rate(node_vmstat_pgmajfault[1m]) > 1000";
description = "{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}"; description = "{{$labels.host}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}";
}; };
# Alert when ext4_errors_value is above 0; the description points at /sys/fs/ext4/*/errors_count for follow-up.
ext4_errors = { ext4_errors = {
condition = "ext4_errors_value > 0"; condition = "ext4_errors_value > 0";
description = "{{$labels.instance}}: ext4 has reported {{$value}} I/O errors: check /sys/fs/ext4/*/errors_count"; description = "{{$labels.host}}: ext4 has reported {{$value}} I/O errors: check /sys/fs/ext4/*/errors_count";
}; };
alerts_silences_changed = { alerts_silences_changed = {