forked from pub-solar/infra
Merge pull request 'alertmanager: alert on high load only after 20m' (#255) from alerts-tweak-load into main
Reviewed-on: pub-solar/infra#255 Reviewed-by: hensoko <hensoko@noreply.git.pub.solar>
This commit is contained in:
commit
73333537a5
|
@ -24,10 +24,10 @@ lib.mapAttrsToList
|
||||||
# description = "Configurations of AlertManager cluster instances are out of sync.";
|
# description = "Configurations of AlertManager cluster instances are out of sync.";
|
||||||
# };
|
# };
|
||||||
|
|
||||||
#alert_manager_e2e_dead_man_switch = {
|
alert_manager_e2e_dead_man_switch = {
|
||||||
# condition = "vector(1)";
|
condition = "vector(1)";
|
||||||
# description = "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.";
|
description = "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.";
|
||||||
#};
|
};
|
||||||
|
|
||||||
# prometheus_not_connected_to_alertmanager = {
|
# prometheus_not_connected_to_alertmanager = {
|
||||||
# condition = "prometheus_notifications_alertmanagers_discovered < 1";
|
# condition = "prometheus_notifications_alertmanagers_discovered < 1";
|
||||||
|
@ -142,8 +142,8 @@ lib.mapAttrsToList
|
||||||
|
|
||||||
cpu_using_90percent = {
|
cpu_using_90percent = {
|
||||||
condition = ''100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) >= 90'';
|
condition = ''100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) >= 90'';
|
||||||
time = "10m";
|
time = "20m";
|
||||||
description = "{{$labels.instance}} is running with cpu usage > 90% for at least 10 minutes: {{$value}}";
|
description = "{{$labels.instance}} is running with cpu usage > 90% for at least 20 minutes: {{$value}}";
|
||||||
};
|
};
|
||||||
|
|
||||||
reboot = {
|
reboot = {
|
||||||
|
@ -234,10 +234,10 @@ lib.mapAttrsToList
|
||||||
};
|
};
|
||||||
*/
|
*/
|
||||||
|
|
||||||
host_memory_under_memory_pressure = {
|
#host_memory_under_memory_pressure = {
|
||||||
condition = "rate(node_vmstat_pgmajfault[1m]) > 1000";
|
# condition = "rate(node_vmstat_pgmajfault[1m]) > 1000";
|
||||||
description = "{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}";
|
# description = "{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}";
|
||||||
};
|
#};
|
||||||
|
|
||||||
# ext4_errors = {
|
# ext4_errors = {
|
||||||
# condition = "ext4_errors_value > 0";
|
# condition = "ext4_errors_value > 0";
|
||||||
|
|
Loading…
Reference in a new issue