diff --git a/modules/prometheus/alert-rules.nix b/modules/prometheus/alert-rules.nix index 45613883..23f66ab5 100644 --- a/modules/prometheus/alert-rules.nix +++ b/modules/prometheus/alert-rules.nix @@ -52,11 +52,9 @@ lib.mapAttrsToList # }; filesystem_full_80percent = { - condition = '' - 100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}) > 80''; + condition = ''100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}) > 80''; time = "10m"; - description = - "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem."; + description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem."; }; # filesystem_inodes_full = { @@ -105,17 +103,14 @@ lib.mapAttrsToList # }; swap_using_20percent = { - condition = - "node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2"; + condition = "node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2"; time = "30m"; - description = - "{{$labels.instance}} is using 20% of its swap space for at least 30 minutes."; + description = "{{$labels.instance}} is using 20% of its swap space for at least 30 minutes."; }; systemd_service_failed = { condition = ''node_systemd_unit_state{state="failed"} == 1''; - description = - "{{$labels.instance}} failed to (re)start service {{$labels.name}}."; + description = "{{$labels.instance}} failed to (re)start service {{$labels.name}}."; }; restic_backup_too_old = { @@ -134,19 +129,15 @@ lib.mapAttrsToList # }; ram_using_90percent = { - condition = - "node_memory_Buffers_bytes + node_memory_MemFree_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1"; + condition = "node_memory_Buffers_bytes + node_memory_MemFree_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1"; time = "1h"; - description = - "{{$labels.instance}} is using at least 90% of its RAM for at least 1 hour."; + description = "{{$labels.instance}} is using at least 90% of its RAM for at least 1 hour."; }; cpu_using_90percent = { - condition = '' - 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) >= 90''; + condition = ''100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) >= 90''; time = "10m"; - description = - "{{$labels.instance}} is running with cpu usage > 90% for at least 10 minutes: {{$value}}"; + description = "{{$labels.instance}} is running with cpu usage > 90% for at least 10 minutes: {{$value}}"; }; reboot = { @@ -156,18 +147,16 @@ lib.mapAttrsToList uptime = { condition = "(time() - node_boot_time_seconds ) / (60*60*24) > 30"; - description = - "Uptime monster: {{$labels.instance}} has been up for more than 30 days."; + description = "Uptime monster: {{$labels.instance}} has been up for more than 30 days."; }; flake_nixpkgs_outdated = { - condition = '' - (time() - flake_input_last_modified{input="nixpkgs"}) / (60*60*24) > 30''; - description = - "Nixpkgs outdated: Nixpkgs on {{$labels.instance}} has not been updated in 30 days"; + condition = ''(time() - flake_input_last_modified{input="nixpkgs"}) / (60*60*24) > 30''; + description = "Nixpkgs outdated: Nixpkgs on {{$labels.instance}} has not been updated in 30 days"; }; - /* ping = { + /* + ping = { condition = "ping_result_code{type!='mobile'} != 0"; description = "{{$labels.url}}: ping from {{$labels.instance}} has failed!"; }; @@ -178,12 +167,11 @@ lib.mapAttrsToList }; */ http_status = { - condition = '' - probe_http_status_code{instance!~"https://megaclan3000.de"} != 200''; - description = - "http request failed from {{$labels.instance}}: {{$labels.result}}!"; + condition = ''probe_http_status_code{instance!~"https://megaclan3000.de"} != 200''; + description = "http request failed from {{$labels.instance}}: {{$labels.result}}!"; }; - /* http_match_failed = { + /* + http_match_failed = { condition = "http_response_response_string_match == 0"; description = "{{$labels.server}} : http body not as expected; status code: {{$labels.status_code}}!"; }; @@ -206,8 +194,7 @@ lib.mapAttrsToList */ cert_expiry = { condition = "(probe_ssl_earliest_cert_expiry - time())/(3600*24) < 30"; - description = - "{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s"; + description = "{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s"; }; # ignore devices that disabled S.M.A.R.T (example if attached via USB) @@ -223,7 +210,8 @@ lib.mapAttrsToList description = "{{$labels.instance}}: OOM kill detected"; }; - /* unusual_disk_read_latency = { + /* + unusual_disk_read_latency = { condition = "rate(diskio_read_time[1m]) / rate(diskio_reads[1m]) > 0.1 and rate(diskio_reads[1m]) > 0"; description = '' @@ -242,8 +230,7 @@ lib.mapAttrsToList host_memory_under_memory_pressure = { condition = "rate(node_vmstat_pgmajfault[1m]) > 1000"; - description = - "{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}"; + description = "{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}"; }; # ext4_errors = { diff --git a/modules/prometheus/default.nix b/modules/prometheus/default.nix index a402f35d..04a9e384 100644 --- a/modules/prometheus/default.nix +++ b/modules/prometheus/default.nix @@ -84,15 +84,19 @@ ]; ruleFiles = [ - (pkgs.writeText "prometheus-rules.yml" (builtins.toJSON { - groups = [{ - name = "alerting-rules"; - rules = import ./alert-rules.nix { inherit lib; }; - }]; - })) + (pkgs.writeText "prometheus-rules.yml" ( + builtins.toJSON { + groups = [ + { + name = "alerting-rules"; + rules = import ./alert-rules.nix { inherit lib; }; + } + ]; + } + )) ]; - alertmanagers = [{ static_configs = [{ targets = [ "localhost:9093" ]; }]; }]; + alertmanagers = [ { static_configs = [ { targets = [ "localhost:9093" ]; } ]; } ]; alertmanager = { enable = true; @@ -109,22 +113,26 @@ repeat_interval = "24h"; }; - receivers = [{ - name = "all"; - # Email config documentation: https://prometheus.io/docs/alerting/latest/configuration/#email_config - email_configs = [{ - send_resolved = true; - to = "TODO"; - from = "alerts@pub.solar"; - smarthost = "TODO"; - auth_username = "TODO"; - auth_password_file = "${config.age.secrets.nachtigall-alertmanager-smtp-password.path}"; - require_tls = true; - }]; - # TODO: - # For matrix notifications, look into: https://github.com/pinpox/matrix-hook and add a webhook - # webhook_configs = [ { url = "http://127.0.0.1:11000/alert"; } ]; - }]; + receivers = [ + { + name = "all"; + # Email config documentation: https://prometheus.io/docs/alerting/latest/configuration/#email_config + email_configs = [ + { + send_resolved = true; + to = "TODO"; + from = "alerts@pub.solar"; + smarthost = "TODO"; + auth_username = "TODO"; + auth_password_file = "${config.age.secrets.nachtigall-alertmanager-smtp-password.path}"; + require_tls = true; + } + ]; + # TODO: + # For matrix notifications, look into: https://github.com/pinpox/matrix-hook and add a webhook + # webhook_configs = [ { url = "http://127.0.0.1:11000/alert"; } ]; + } + ]; }; }; };