Merge pull request 'alertmanager: alert on high load only after 20m' (#255) from alerts-tweak-load into main

Reviewed-on: pub-solar/infra#255
Reviewed-by: hensoko <hensoko@noreply.git.pub.solar>
This commit is contained in:
teutat3s 2024-11-12 14:47:53 +00:00
commit 73333537a5
Signed by: pub.solar gitea
GPG key ID: F0332B04B7054873

View file

@ -24,10 +24,10 @@ lib.mapAttrsToList
# description = "Configurations of AlertManager cluster instances are out of sync."; # description = "Configurations of AlertManager cluster instances are out of sync.";
# }; # };
#alert_manager_e2e_dead_man_switch = { alert_manager_e2e_dead_man_switch = {
# condition = "vector(1)"; condition = "vector(1)";
# description = "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager."; description = "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.";
#}; };
# prometheus_not_connected_to_alertmanager = { # prometheus_not_connected_to_alertmanager = {
# condition = "prometheus_notifications_alertmanagers_discovered < 1"; # condition = "prometheus_notifications_alertmanagers_discovered < 1";
@ -142,8 +142,8 @@ lib.mapAttrsToList
cpu_using_90percent = { cpu_using_90percent = {
condition = ''100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) >= 90''; condition = ''100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) >= 90'';
time = "10m"; time = "20m";
description = "{{$labels.instance}} is running with cpu usage > 90% for at least 10 minutes: {{$value}}"; description = "{{$labels.instance}} is running with cpu usage > 90% for at least 20 minutes: {{$value}}";
}; };
reboot = { reboot = {
@ -234,10 +234,10 @@ lib.mapAttrsToList
}; };
*/ */
host_memory_under_memory_pressure = { #host_memory_under_memory_pressure = {
condition = "rate(node_vmstat_pgmajfault[1m]) > 1000"; # condition = "rate(node_vmstat_pgmajfault[1m]) > 1000";
description = "{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}"; # description = "{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}";
}; #};
# ext4_errors = { # ext4_errors = {
# condition = "ext4_errors_value > 0"; # condition = "ext4_errors_value > 0";