modules/nixos/monitoring/alert-rules: various
- instance -> host
- simplify telegraf_down
- also exclude darwin from load15
parent 161833f265
commit d883b923d5
1 changed file with 17 additions and 17 deletions
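For context, a minimal sketch (not taken from this repository) of how an attrset of { condition, time, description } entries like the ones changed below could be turned into Prometheus-style alert rules with lib.mapAttrsToList. The surrounding module shape and names such as `alertRules` are assumptions for illustration only.

# Hypothetical sketch; only lib.mapAttrsToList and the entry shape are taken from the diff.
{ lib, ... }:
let
  alertRules = {
    load15 = {
      condition = ''system_load15 / system_n_cpus{host!~"(build|darwin).*"} >= 2.0'';
      time = "10m";
      description = "{{$labels.host}} is running with load15 > 1 for at least 5 minutes: {{$value}}";
    };
  };
in
{
  # One rule per attribute; the attribute name becomes the alert name.
  rules = lib.mapAttrsToList
    (name: opts: {
      alert = name;
      expr = opts.condition;
      for = opts.time or "2m"; # assumed default for entries without `time`
      annotations.description = opts.description;
    })
    alertRules;
}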
@@ -52,13 +52,13 @@ lib.mapAttrsToList
   filesystem_full_80percent = {
     condition = ''disk_used_percent{mode!="ro"} >= 80'';
     time = "10m";
-    description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.path}} got less than 20% space left on its filesystem";
+    description = "{{$labels.host}} device {{$labels.device}} on {{$labels.path}} got less than 20% space left on its filesystem";
   };
 
   filesystem_inodes_full = {
     condition = ''disk_inodes_free / disk_inodes_total < 0.10'';
     time = "10m";
-    description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.path}} got less than 10% inodes left on its filesystem";
+    description = "{{$labels.host}} device {{$labels.device}} on {{$labels.path}} got less than 10% inodes left on its filesystem";
   };
 
   daily_task_not_run = {
@@ -101,7 +101,7 @@ lib.mapAttrsToList
   };
 
   load15 = {
-    condition = ''system_load15 / system_n_cpus{instance!~"build.*.nix-community.org:9273"} >= 2.0'';
+    condition = ''system_load15 / system_n_cpus{host!~"(build|darwin).*"} >= 2.0'';
     time = "10m";
     description = "{{$labels.host}} is running with load15 > 1 for at least 5 minutes: {{$value}}";
   };
@@ -122,14 +122,14 @@ lib.mapAttrsToList
   };
 
   telegraf_down = {
-    condition = ''min(up{job=~"telegraf"}) by (source, job, instance, org) == 0'';
+    condition = ''min(up{job=~"telegraf"}) by (job, instance, org) == 0'';
     time = "3m";
-    description = "{{$labels.instance}}: {{$labels.job}} telegraf exporter from {{$labels.source}} is down";
+    description = "{{$labels.host}}: telegraf exporter is down";
   };
 
   http = {
     condition = "http_response_result_code != 0";
-    description = "{{$labels.server}} : http request failed from {{$labels.instance}}: {{$labels.result}}";
+    description = "{{$labels.server}} : http request failed from {{$labels.host}}: {{$labels.result}}";
   };
 
   http_match_failed = {
@@ -139,44 +139,44 @@ lib.mapAttrsToList
 
   connection_failed = {
     condition = "net_response_result_code != 0";
-    description = "{{$labels.server}}: connection to {{$labels.port}}({{$labels.protocol}}) failed from {{$labels.instance}}";
+    description = "{{$labels.server}}: connection to {{$labels.port}}({{$labels.protocol}}) failed from {{$labels.host}}";
   };
 
   zfs_errors = {
     condition = "zfs_arcstats_l2_io_error + zfs_dmu_tx_error + zfs_arcstats_l2_writes_error > 0";
-    description = "{{$labels.instance}} reports: {{$value}} ZFS IO errors";
+    description = "{{$labels.host}} reports: {{$value}} ZFS IO errors";
   };
 
   zpool_status = {
     condition = "zpool_status_errors > 0";
-    description = "{{$labels.instance}} reports: zpool {{$labels.name}} has {{$value}} errors";
+    description = "{{$labels.host}} reports: zpool {{$labels.name}} has {{$value}} errors";
   };
 
   mdraid_degraded_disks = {
     condition = "mdstat_degraded_disks > 0";
-    description = "{{$labels.instance}}: raid {{$labels.dev}} has failed disks";
+    description = "{{$labels.host}}: raid {{$labels.dev}} has failed disks";
   };
 
   # ignore devices that disabled S.M.A.R.T (example if attached via USB)
   # Also ignore build02, build03
   smart_errors = {
-    condition = ''smart_device_health_ok{enabled!="Disabled", instance!~"(build02|build03).nix-community.org:9273"} != 1'';
-    description = "{{$labels.instance}}: S.M.A.R.T reports: {{$labels.device}} ({{$labels.model}}) has errors";
+    condition = ''smart_device_health_ok{enabled!="Disabled", host!~"(build02|build03)"} != 1'';
+    description = "{{$labels.host}}: S.M.A.R.T reports: {{$labels.device}} ({{$labels.model}}) has errors";
   };
 
   oom_kills = {
     condition = "increase(kernel_vmstat_oom_kill[5m]) > 0";
-    description = "{{$labels.instance}}: OOM kill detected";
+    description = "{{$labels.host}}: OOM kill detected";
   };
 
   unusual_disk_read_latency = {
     condition = "rate(diskio_read_time[1m]) / rate(diskio_reads[1m]) > 0.1 and rate(diskio_reads[1m]) > 0";
-    description = "{{$labels.instance}}: Disk latency is growing (read operations > 100ms)";
+    description = "{{$labels.host}}: Disk latency is growing (read operations > 100ms)";
   };
 
   unusual_disk_write_latency = {
     condition = "rate(diskio_write_time[1m]) / rate(diskio_write[1m]) > 0.1 and rate(diskio_write[1m]) > 0";
-    description = "{{$labels.instance}}: Disk latency is growing (write operations > 100ms)";
+    description = "{{$labels.host}}: Disk latency is growing (write operations > 100ms)";
   };
 
   ipv6_dad_check = {
@@ -186,12 +186,12 @@ lib.mapAttrsToList
 
   host_memory_under_memory_pressure = {
     condition = "rate(node_vmstat_pgmajfault[1m]) > 1000";
-    description = "{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}";
+    description = "{{$labels.host}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}";
   };
 
   ext4_errors = {
     condition = "ext4_errors_value > 0";
-    description = "{{$labels.instance}}: ext4 has reported {{$value}} I/O errors: check /sys/fs/ext4/*/errors_count";
+    description = "{{$labels.host}}: ext4 has reported {{$value}} I/O errors: check /sys/fs/ext4/*/errors_count";
   };
 
   alerts_silences_changed = {
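On where the `host` label that the rules now match on comes from: telegraf attaches a `host` tag (the machine's hostname) to every metric by default and the Prometheus output exposes it as a label, whereas `instance` is the scrape target address (hostname plus port, e.g. the ":9273" seen in the old matchers). A minimal sketch of the relevant NixOS configuration, assuming the standard services.telegraf module and not taken from this repository:

# Hypothetical sketch of a telegraf setup that yields the `host` label used above.
{
  services.telegraf = {
    enable = true;
    extraConfig = {
      # The agent tags every metric with the machine's hostname as `host`;
      # setting omit_hostname = true would drop the tag these alerts rely on.
      agent.omit_hostname = false;
      # Expose metrics for Prometheus to scrape; :9273 matches the port seen
      # in the old instance-based matchers.
      outputs.prometheus_client = [ { listen = ":9273"; } ];
    };
  };
}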