diff --git a/modules/grafana/default.nix b/modules/grafana/default.nix index e8fa7181..be124743 100644 --- a/modules/grafana/default.nix +++ b/modules/grafana/default.nix @@ -15,6 +15,7 @@ file = "${flake.self}/secrets/grafana-smtp-password.age"; mode = "644"; owner = "grafana"; + group = "prometheus"; }; age.secrets.grafana-keycloak-client-secret = { file = "${flake.self}/secrets/grafana-keycloak-client-secret.age"; diff --git a/modules/prometheus/alert-rules.nix b/modules/prometheus/alert-rules.nix index 23f66ab5..fb832a0b 100644 --- a/modules/prometheus/alert-rules.nix +++ b/modules/prometheus/alert-rules.nix @@ -51,12 +51,18 @@ lib.mapAttrsToList # description = ''{{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} has been lagging by more than 1MB for more than 15m.''; # }; - filesystem_full_80percent = { + filesystem_root_full_80percent = { condition = ''100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}) > 80''; time = "10m"; description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem."; }; + filesystem_data_full_80percent = { + condition = ''100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/var/lib"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/var/lib"}) > 80''; + time = "10m"; + description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem."; + }; + # filesystem_inodes_full = { # condition = ''disk_inodes_free / disk_inodes_total < 0.10''; # time = "10m"; @@ -102,11 +108,11 @@ lib.mapAttrsToList # "homeassistant notification {{$labels.entity}} ({{$labels.friendly_name}}): {{$value}}"; # }; - swap_using_20percent = { - condition = "node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2"; - time = "30m"; - description = "{{$labels.instance}} is using 20% of its swap space for at least 30 minutes."; - }; + #swap_using_20percent = { + # condition = "node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2"; + # time = "30m"; + # description = "{{$labels.instance}} is using 20% of its swap space for at least 30 minutes."; + #}; systemd_service_failed = { condition = ''node_systemd_unit_state{state="failed"} == 1''; @@ -118,10 +124,10 @@ lib.mapAttrsToList description = "{{$labels.instance}} not backed up for more than 24 hours. ({{$value}})"; }; - host_down = { - condition = ''up{job="node-stats", instance!~"ahorn.wireguard:9100|kartoffel.wireguard:9100|mega.wireguard:9100"} == 0''; - description = "{{$labels.instance}} is down!"; - }; + #host_down = { + # condition = ''up{job="node-stats", instance!~"ahorn.wireguard:9100|kartoffel.wireguard:9100|mega.wireguard:9100"} == 0''; + # description = "{{$labels.instance}} is down!"; + #}; # service_not_running = { # condition = ''systemd_units_active_code{name=~"teamspeak3-server.service|tt-rss.service", sub!="running"}''; @@ -166,10 +172,10 @@ lib.mapAttrsToList description = "{{$labels.instance}}: ping probe from {{$labels.source}} is encountering high latency!"; }; */ - http_status = { - condition = ''probe_http_status_code{instance!~"https://megaclan3000.de"} != 200''; - description = "http request failed from {{$labels.instance}}: {{$labels.result}}!"; - }; + #http_status = { + # condition = ''probe_http_status_code{instance!~"https://pub.solar"} != 200''; + # description = "http request failed from {{$labels.instance}}: {{$labels.result}}!"; + #}; /* http_match_failed = { condition = "http_response_response_string_match == 0"; @@ -192,10 +198,10 @@ lib.mapAttrsToList description = "{{$labels.instance}}: healtcheck {{$labels.job}} fails!"; }; */ - cert_expiry = { - condition = "(probe_ssl_earliest_cert_expiry - time())/(3600*24) < 30"; - description = "{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s"; - }; + #cert_expiry = { + # condition = "(probe_ssl_earliest_cert_expiry - time())/(3600*24) < 30"; + # description = "{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s"; + #}; # ignore devices that disabled S.M.A.R.T (example if attached via USB) diff --git a/modules/prometheus/default.nix b/modules/prometheus/default.nix index 04a9e384..c58c48b2 100644 --- a/modules/prometheus/default.nix +++ b/modules/prometheus/default.nix @@ -17,6 +17,8 @@ output discard ''; extraConfig = '' + bind 10.7.6.2 fd00:fae:fae:fae:fae:2:: + tls internal reverse_proxy :${toString config.services.prometheus.alertmanager.port} ''; }; @@ -101,7 +103,7 @@ alertmanager = { enable = true; # port = 9093; # Default - webExternalUrl = "https://alerts.pub.solar"; # TODO use a proper url? + webExternalUrl = "https://alerts.pub.solar"; # environmentFile = "${config.age.secrets.nachtigall-alertmanager-envfile.path}"; configuration = { @@ -120,11 +122,11 @@ email_configs = [ { send_resolved = true; - to = "TODO"; + to = "admins@pub.solar"; from = "alerts@pub.solar"; - smarthost = "TODO"; - auth_username = "TODO"; - auth_password_file = "${config.age.secrets.nachtigall-alertmanager-smtp-password.path}"; + smarthost = "mail.greenbaum.zone:465"; + auth_username = "admins@pub.solar"; + auth_password_file = "${config.age.secrets.grafana-smtp-password.path}"; require_tls = true; } ]; diff --git a/terraform/dns.tf b/terraform/dns.tf index a35978cf..039dd868 100644 --- a/terraform/dns.tf +++ b/terraform/dns.tf @@ -21,8 +21,8 @@ resource "namecheap_domain_records" "pub-solar" { } record { hostname = "alerts" - type = "CNAME" - address = "flora-6.pub.solar." + type = "A" + address = "10.7.6.2" } record { hostname = "git"