From afe52ca6af107b255f6e5aa00bc9c7a8fc1f1236 Mon Sep 17 00:00:00 2001 From: teutat3s Date: Wed, 6 Nov 2024 21:28:28 +0100 Subject: [PATCH 1/3] alertmanager: alert on high load only after 20m --- modules/prometheus/alert-rules.nix | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/prometheus/alert-rules.nix b/modules/prometheus/alert-rules.nix index 866709b7..3c1b601a 100644 --- a/modules/prometheus/alert-rules.nix +++ b/modules/prometheus/alert-rules.nix @@ -142,8 +142,8 @@ lib.mapAttrsToList cpu_using_90percent = { condition = ''100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) >= 90''; - time = "10m"; - description = "{{$labels.instance}} is running with cpu usage > 90% for at least 10 minutes: {{$value}}"; + time = "20m"; + description = "{{$labels.instance}} is running with cpu usage > 90% for at least 20 minutes: {{$value}}"; }; reboot = { From a9c5edfeb3c99443f8c8c5419d5b81e8f197b149 Mon Sep 17 00:00:00 2001 From: teutat3s Date: Tue, 12 Nov 2024 13:40:46 +0100 Subject: [PATCH 2/3] alertmanager: don't alert on high memory page faults This alert is non-actionable; we still monitor high memory usage. --- modules/prometheus/alert-rules.nix | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/prometheus/alert-rules.nix b/modules/prometheus/alert-rules.nix index 3c1b601a..556071b8 100644 --- a/modules/prometheus/alert-rules.nix +++ b/modules/prometheus/alert-rules.nix @@ -234,10 +234,10 @@ lib.mapAttrsToList }; */ - host_memory_under_memory_pressure = { - condition = "rate(node_vmstat_pgmajfault[1m]) > 1000"; - description = "{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}"; - }; + #host_memory_under_memory_pressure = { + # condition = "rate(node_vmstat_pgmajfault[1m]) > 1000"; + # description = "{{$labels.instance}}: The node is under heavy memory pressure. 
High rate of major page faults: {{$value}}"; + #}; # ext4_errors = { # condition = "ext4_errors_value > 0"; From ab85ba751aa20ffc5624eb68766b7d8e29df0ea4 Mon Sep 17 00:00:00 2001 From: teutat3s Date: Tue, 12 Nov 2024 13:41:25 +0100 Subject: [PATCH 3/3] alertmanager: enable e2e_dead_man_switch --- modules/prometheus/alert-rules.nix | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/prometheus/alert-rules.nix b/modules/prometheus/alert-rules.nix index 556071b8..a4846732 100644 --- a/modules/prometheus/alert-rules.nix +++ b/modules/prometheus/alert-rules.nix @@ -24,10 +24,10 @@ lib.mapAttrsToList # description = "Configurations of AlertManager cluster instances are out of sync."; # }; - #alert_manager_e2e_dead_man_switch = { - # condition = "vector(1)"; - # description = "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager."; - #}; + alert_manager_e2e_dead_man_switch = { + condition = "vector(1)"; + description = "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager."; + }; # prometheus_not_connected_to_alertmanager = { # condition = "prometheus_notifications_alertmanagers_discovered < 1";