Add alertmanager config - part 2 #189

Merged
teutat3s merged 9 commits from alertmanager into main 2024-05-15 15:23:59 +00:00
2 changed files with 53 additions and 58 deletions
Showing only changes of commit a8a8155114

View file

@@ -52,11 +52,9 @@ lib.mapAttrsToList
     # };
     filesystem_full_80percent = {
-      condition = ''
-        100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}) > 80'';
+      condition = ''100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}) > 80'';
       time = "10m";
-      description =
-        "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem.";
+      description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem.";
     };
     # filesystem_inodes_full = {
@@ -105,17 +103,14 @@ lib.mapAttrsToList
     # };
     swap_using_20percent = {
-      condition =
-        "node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2";
+      condition = "node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2";
       time = "30m";
-      description =
-        "{{$labels.instance}} is using 20% of its swap space for at least 30 minutes.";
+      description = "{{$labels.instance}} is using 20% of its swap space for at least 30 minutes.";
     };
     systemd_service_failed = {
       condition = ''node_systemd_unit_state{state="failed"} == 1'';
-      description =
-        "{{$labels.instance}} failed to (re)start service {{$labels.name}}.";
+      description = "{{$labels.instance}} failed to (re)start service {{$labels.name}}.";
     };
     restic_backup_too_old = {
@@ -134,19 +129,15 @@ lib.mapAttrsToList
     # };
     ram_using_90percent = {
-      condition =
-        "node_memory_Buffers_bytes + node_memory_MemFree_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1";
+      condition = "node_memory_Buffers_bytes + node_memory_MemFree_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1";
       time = "1h";
-      description =
-        "{{$labels.instance}} is using at least 90% of its RAM for at least 1 hour.";
+      description = "{{$labels.instance}} is using at least 90% of its RAM for at least 1 hour.";
     };
     cpu_using_90percent = {
-      condition = ''
-        100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) >= 90'';
+      condition = ''100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) >= 90'';
       time = "10m";
-      description =
-        "{{$labels.instance}} is running with cpu usage > 90% for at least 10 minutes: {{$value}}";
+      description = "{{$labels.instance}} is running with cpu usage > 90% for at least 10 minutes: {{$value}}";
     };
     reboot = {
reboot = {
@@ -156,18 +147,16 @@ lib.mapAttrsToList
     uptime = {
       condition = "(time() - node_boot_time_seconds ) / (60*60*24) > 30";
-      description =
-        "Uptime monster: {{$labels.instance}} has been up for more than 30 days.";
+      description = "Uptime monster: {{$labels.instance}} has been up for more than 30 days.";
     };
     flake_nixpkgs_outdated = {
-      condition = ''
-        (time() - flake_input_last_modified{input="nixpkgs"}) / (60*60*24) > 30'';
-      description =
-        "Nixpkgs outdated: Nixpkgs on {{$labels.instance}} has not been updated in 30 days";
+      condition = ''(time() - flake_input_last_modified{input="nixpkgs"}) / (60*60*24) > 30'';
+      description = "Nixpkgs outdated: Nixpkgs on {{$labels.instance}} has not been updated in 30 days";
     };
-    /* ping = {
+    /*
+      ping = {
       condition = "ping_result_code{type!='mobile'} != 0";
       description = "{{$labels.url}}: ping from {{$labels.instance}} has failed!";
     };
@@ -178,12 +167,11 @@ lib.mapAttrsToList
     };
     */
     http_status = {
-      condition = ''
-        probe_http_status_code{instance!~"https://megaclan3000.de"} != 200'';
-      description =
-        "http request failed from {{$labels.instance}}: {{$labels.result}}!";
+      condition = ''probe_http_status_code{instance!~"https://megaclan3000.de"} != 200'';
+      description = "http request failed from {{$labels.instance}}: {{$labels.result}}!";
     };
-    /* http_match_failed = {
+    /*
+      http_match_failed = {
       condition = "http_response_response_string_match == 0";
       description = "{{$labels.server}} : http body not as expected; status code: {{$labels.status_code}}!";
     };
@@ -206,8 +194,7 @@ lib.mapAttrsToList
     */
     cert_expiry = {
       condition = "(probe_ssl_earliest_cert_expiry - time())/(3600*24) < 30";
-      description =
-        "{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s";
+      description = "{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s";
     };
     # ignore devices that disabled S.M.A.R.T (example if attached via USB)
@@ -223,7 +210,8 @@ lib.mapAttrsToList
       description = "{{$labels.instance}}: OOM kill detected";
     };
-    /* unusual_disk_read_latency = {
+    /*
+      unusual_disk_read_latency = {
       condition =
        "rate(diskio_read_time[1m]) / rate(diskio_reads[1m]) > 0.1 and rate(diskio_reads[1m]) > 0";
      description = ''
@@ -242,8 +230,7 @@ lib.mapAttrsToList
     host_memory_under_memory_pressure = {
       condition = "rate(node_vmstat_pgmajfault[1m]) > 1000";
-      description =
-        "{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}";
+      description = "{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}";
     };
     # ext4_errors = {
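
Reviewer note, not part of the diff: alert-rules.nix defines these alerts as a plain attribute set, and the lib.mapAttrsToList call shown in the hunk headers turns each entry into one Prometheus alerting rule. Below is a minimal sketch of that shape with a hypothetical mapping function; the attribute names alert/expr/for/annotations follow the Prometheus rule schema, the real mapping in alert-rules.nix may differ, and the "2m" fallback is purely illustrative.

{ lib }:
lib.mapAttrsToList
  (name: opts: {
    # Hypothetical mapping: one Prometheus rule attrset per entry above.
    alert = name;
    expr = opts.condition;
    for = opts.time or "2m"; # illustrative fallback for rules without a `time`
    annotations.description = opts.description;
  })
  {
    swap_using_20percent = {
      condition = "node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2";
      time = "30m";
      description = "{{$labels.instance}} is using 20% of its swap space for at least 30 minutes.";
    };
  }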

View file

@@ -84,15 +84,19 @@
     ];
     ruleFiles = [
-      (pkgs.writeText "prometheus-rules.yml" (builtins.toJSON {
-        groups = [{
-          name = "alerting-rules";
-          rules = import ./alert-rules.nix { inherit lib; };
-        }];
-      }))
+      (pkgs.writeText "prometheus-rules.yml" (
+        builtins.toJSON {
+          groups = [
+            {
+              name = "alerting-rules";
+              rules = import ./alert-rules.nix { inherit lib; };
+            }
+          ];
+        }
+      ))
     ];
-    alertmanagers = [{ static_configs = [{ targets = [ "localhost:9093" ]; }]; }];
+    alertmanagers = [ { static_configs = [ { targets = [ "localhost:9093" ]; } ]; } ];
     alertmanager = {
       enable = true;
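
Side note on the ruleFiles hunk above: builtins.toJSON serializes the groups attribute set into a JSON string that pkgs.writeText drops into the Nix store, and Prometheus accepts that file because rule files are parsed as YAML, of which JSON is a subset. A rough illustration, with an empty rules list standing in for the imported alert-rules.nix:

builtins.toJSON {
  groups = [
    {
      name = "alerting-rules";
      rules = [ ]; # stand-in for: import ./alert-rules.nix { inherit lib; }
    }
  ];
}
# evaluates to the string:
#   {"groups":[{"name":"alerting-rules","rules":[]}]}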
@@ -109,22 +113,26 @@
           repeat_interval = "24h";
         };
-        receivers = [{
-          name = "all";
-          # Email config documentation: https://prometheus.io/docs/alerting/latest/configuration/#email_config
-          email_configs = [{
-            send_resolved = true;
-            to = "TODO";
-            from = "alerts@pub.solar";
-            smarthost = "TODO";
-            auth_username = "TODO";
-            auth_password_file = "${config.age.secrets.nachtigall-alertmanager-smtp-password.path}";
-            require_tls = true;
-          }];
-          # TODO:
-          # For matrix notifications, look into: https://github.com/pinpox/matrix-hook and add a webhook
-          # webhook_configs = [ { url = "http://127.0.0.1:11000/alert"; } ];
-        }];
+        receivers = [
+          {
+            name = "all";
+            # Email config documentation: https://prometheus.io/docs/alerting/latest/configuration/#email_config
+            email_configs = [
+              {
+                send_resolved = true;
+                to = "TODO";
+                from = "alerts@pub.solar";
+                smarthost = "TODO";
+                auth_username = "TODO";
+                auth_password_file = "${config.age.secrets.nachtigall-alertmanager-smtp-password.path}";
+                require_tls = true;
+              }
+            ];
+            # TODO:
+            # For matrix notifications, look into: https://github.com/pinpox/matrix-hook and add a webhook
+            # webhook_configs = [ { url = "http://127.0.0.1:11000/alert"; } ];
+          }
+        ];
       };
     };
   };
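
If the matrix-hook TODO in the receiver above gets picked up later, the commented webhook line would roughly expand into the entry below. This is only a sketch under the assumptions already in the comment (matrix-hook from https://github.com/pinpox/matrix-hook listening on 127.0.0.1:11000 with an /alert endpoint); the service itself still needs to be deployed and configured separately.

webhook_configs = [
  {
    # Forward both firing and resolved notifications to the local matrix-hook instance.
    send_resolved = true;
    url = "http://127.0.0.1:11000/alert";
  }
];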