alertmanager: alert on high load only after 20m #255

Merged
teutat3s merged 3 commits from alerts-tweak-load into main 2024-11-12 14:47:54 +00:00

View file

@ -24,10 +24,10 @@ lib.mapAttrsToList
# description = "Configurations of AlertManager cluster instances are out of sync."; # description = "Configurations of AlertManager cluster instances are out of sync.";
# }; # };
#alert_manager_e2e_dead_man_switch = { alert_manager_e2e_dead_man_switch = {
# condition = "vector(1)"; condition = "vector(1)";
# description = "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager."; description = "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.";
#}; };
# prometheus_not_connected_to_alertmanager = { # prometheus_not_connected_to_alertmanager = {
# condition = "prometheus_notifications_alertmanagers_discovered < 1"; # condition = "prometheus_notifications_alertmanagers_discovered < 1";
@ -142,8 +142,8 @@ lib.mapAttrsToList
cpu_using_90percent = { cpu_using_90percent = {
condition = ''100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) >= 90''; condition = ''100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) >= 90'';
time = "10m"; time = "20m";
description = "{{$labels.instance}} is running with cpu usage > 90% for at least 10 minutes: {{$value}}"; description = "{{$labels.instance}} is running with cpu usage > 90% for at least 20 minutes: {{$value}}";
}; };
reboot = { reboot = {
@ -234,10 +234,10 @@ lib.mapAttrsToList
}; };
*/ */
host_memory_under_memory_pressure = { #host_memory_under_memory_pressure = {
hensoko marked this conversation as resolved
Review

Why is this check disabled?
condition = "rate(node_vmstat_pgmajfault[1m]) > 1000"; # condition = "rate(node_vmstat_pgmajfault[1m]) > 1000";
description = "{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}"; # description = "{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}";
}; #};
# ext4_errors = { # ext4_errors = {
# condition = "ext4_errors_value > 0"; # condition = "ext4_errors_value > 0";