Add alertmanager config - part 2 #189

Merged
teutat3s merged 9 commits from alertmanager into main 2024-05-15 15:23:59 +00:00
2 changed files with 53 additions and 58 deletions
Showing only changes of commit a8a8155114 - Show all commits

View file

@@ -52,11 +52,9 @@ lib.mapAttrsToList
# }; # };
filesystem_full_80percent = { filesystem_full_80percent = {
condition = '' condition = ''100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}) > 80'';
100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}) > 80'';
time = "10m"; time = "10m";
description = description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem.";
"{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem.";
}; };
# filesystem_inodes_full = { # filesystem_inodes_full = {
@@ -105,17 +103,14 @@ lib.mapAttrsToList
# }; # };
swap_using_20percent = { swap_using_20percent = {
condition = condition = "node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2";
"node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2";
time = "30m"; time = "30m";
description = description = "{{$labels.instance}} is using 20% of its swap space for at least 30 minutes.";
"{{$labels.instance}} is using 20% of its swap space for at least 30 minutes.";
}; };
systemd_service_failed = { systemd_service_failed = {
condition = ''node_systemd_unit_state{state="failed"} == 1''; condition = ''node_systemd_unit_state{state="failed"} == 1'';
description = description = "{{$labels.instance}} failed to (re)start service {{$labels.name}}.";
"{{$labels.instance}} failed to (re)start service {{$labels.name}}.";
}; };
restic_backup_too_old = { restic_backup_too_old = {
@@ -134,19 +129,15 @@ lib.mapAttrsToList
# }; # };
ram_using_90percent = { ram_using_90percent = {
condition = condition = "node_memory_Buffers_bytes + node_memory_MemFree_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1";
"node_memory_Buffers_bytes + node_memory_MemFree_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1";
time = "1h"; time = "1h";
description = description = "{{$labels.instance}} is using at least 90% of its RAM for at least 1 hour.";
"{{$labels.instance}} is using at least 90% of its RAM for at least 1 hour.";
}; };
cpu_using_90percent = { cpu_using_90percent = {
condition = '' condition = ''100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) >= 90'';
100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) >= 90'';
time = "10m"; time = "10m";
description = description = "{{$labels.instance}} is running with cpu usage > 90% for at least 10 minutes: {{$value}}";
"{{$labels.instance}} is running with cpu usage > 90% for at least 10 minutes: {{$value}}";
}; };
reboot = { reboot = {
@@ -156,18 +147,16 @@ lib.mapAttrsToList
uptime = { uptime = {
condition = "(time() - node_boot_time_seconds ) / (60*60*24) > 30"; condition = "(time() - node_boot_time_seconds ) / (60*60*24) > 30";
description = description = "Uptime monster: {{$labels.instance}} has been up for more than 30 days.";
"Uptime monster: {{$labels.instance}} has been up for more than 30 days.";
}; };
flake_nixpkgs_outdated = { flake_nixpkgs_outdated = {
condition = '' condition = ''(time() - flake_input_last_modified{input="nixpkgs"}) / (60*60*24) > 30'';
(time() - flake_input_last_modified{input="nixpkgs"}) / (60*60*24) > 30''; description = "Nixpkgs outdated: Nixpkgs on {{$labels.instance}} has not been updated in 30 days";
description =
"Nixpkgs outdated: Nixpkgs on {{$labels.instance}} has not been updated in 30 days";
}; };
/* ping = { /*
ping = {
condition = "ping_result_code{type!='mobile'} != 0"; condition = "ping_result_code{type!='mobile'} != 0";
description = "{{$labels.url}}: ping from {{$labels.instance}} has failed!"; description = "{{$labels.url}}: ping from {{$labels.instance}} has failed!";
}; };
@@ -178,12 +167,11 @@ lib.mapAttrsToList
}; };
*/ */
http_status = { http_status = {
condition = '' condition = ''probe_http_status_code{instance!~"https://megaclan3000.de"} != 200'';
probe_http_status_code{instance!~"https://megaclan3000.de"} != 200''; description = "http request failed from {{$labels.instance}}: {{$labels.result}}!";
description =
"http request failed from {{$labels.instance}}: {{$labels.result}}!";
}; };
/* http_match_failed = { /*
http_match_failed = {
condition = "http_response_response_string_match == 0"; condition = "http_response_response_string_match == 0";
description = "{{$labels.server}} : http body not as expected; status code: {{$labels.status_code}}!"; description = "{{$labels.server}} : http body not as expected; status code: {{$labels.status_code}}!";
}; };
@@ -206,8 +194,7 @@ lib.mapAttrsToList
*/ */
cert_expiry = { cert_expiry = {
condition = "(probe_ssl_earliest_cert_expiry - time())/(3600*24) < 30"; condition = "(probe_ssl_earliest_cert_expiry - time())/(3600*24) < 30";
description = description = "{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s";
"{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s";
}; };
# ignore devices that disabled S.M.A.R.T (example if attached via USB) # ignore devices that disabled S.M.A.R.T (example if attached via USB)
@@ -223,7 +210,8 @@ lib.mapAttrsToList
description = "{{$labels.instance}}: OOM kill detected"; description = "{{$labels.instance}}: OOM kill detected";
}; };
/* unusual_disk_read_latency = { /*
unusual_disk_read_latency = {
condition = condition =
"rate(diskio_read_time[1m]) / rate(diskio_reads[1m]) > 0.1 and rate(diskio_reads[1m]) > 0"; "rate(diskio_read_time[1m]) / rate(diskio_reads[1m]) > 0.1 and rate(diskio_reads[1m]) > 0";
description = '' description = ''
@@ -242,8 +230,7 @@ lib.mapAttrsToList
host_memory_under_memory_pressure = { host_memory_under_memory_pressure = {
condition = "rate(node_vmstat_pgmajfault[1m]) > 1000"; condition = "rate(node_vmstat_pgmajfault[1m]) > 1000";
description = description = "{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}";
"{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}";
}; };
# ext4_errors = { # ext4_errors = {

View file

@@ -84,12 +84,16 @@
]; ];
ruleFiles = [ ruleFiles = [
(pkgs.writeText "prometheus-rules.yml" (builtins.toJSON { (pkgs.writeText "prometheus-rules.yml" (
groups = [{ builtins.toJSON {
groups = [
{
name = "alerting-rules"; name = "alerting-rules";
rules = import ./alert-rules.nix { inherit lib; }; rules = import ./alert-rules.nix { inherit lib; };
}]; }
})) ];
}
))
]; ];
alertmanagers = [ { static_configs = [ { targets = [ "localhost:9093" ]; } ]; } ]; alertmanagers = [ { static_configs = [ { targets = [ "localhost:9093" ]; } ]; } ];
@@ -109,10 +113,12 @@
repeat_interval = "24h"; repeat_interval = "24h";
}; };
receivers = [{ receivers = [
{
name = "all"; name = "all";
# Email config documentation: https://prometheus.io/docs/alerting/latest/configuration/#email_config # Email config documentation: https://prometheus.io/docs/alerting/latest/configuration/#email_config
email_configs = [{ email_configs = [
{
send_resolved = true; send_resolved = true;
to = "TODO"; to = "TODO";
from = "alerts@pub.solar"; from = "alerts@pub.solar";
@@ -120,11 +126,13 @@
auth_username = "TODO"; auth_username = "TODO";
auth_password_file = "${config.age.secrets.nachtigall-alertmanager-smtp-password.path}"; auth_password_file = "${config.age.secrets.nachtigall-alertmanager-smtp-password.path}";
require_tls = true; require_tls = true;
}]; }
];
# TODO: # TODO:
# For matrix notifications, look into: https://github.com/pinpox/matrix-hook and add a webhook # For matrix notifications, look into: https://github.com/pinpox/matrix-hook and add a webhook
# webhook_configs = [ { url = "http://127.0.0.1:11000/alert"; } ]; # webhook_configs = [ { url = "http://127.0.0.1:11000/alert"; } ];
}]; }
];
}; };
}; };
}; };