From 7e2bcfc5cf168340f109f19347cd5307a5d0a7df Mon Sep 17 00:00:00 2001 From: Pablo Ovelleiro Corral Date: Sat, 27 Apr 2024 00:08:23 +0200 Subject: [PATCH 1/9] Add alertmanager config --- modules/prometheus/alert-rules.nix | 260 +++++++++++++++++++++++++++++ modules/prometheus/default.nix | 45 +++++ 2 files changed, 305 insertions(+) create mode 100644 modules/prometheus/alert-rules.nix diff --git a/modules/prometheus/alert-rules.nix b/modules/prometheus/alert-rules.nix new file mode 100644 index 0000000..4561388 --- /dev/null +++ b/modules/prometheus/alert-rules.nix @@ -0,0 +1,260 @@ +{ lib }: + +let + # docker's filesystems disappear quickly, leading to false positives + deviceFilter = ''path!~"^(/var/lib/docker|/nix/store).*"''; +in +lib.mapAttrsToList + (name: opts: { + alert = name; + expr = opts.condition; + for = opts.time or "2m"; + labels = { }; + annotations.description = opts.description; + }) + ({ + + # prometheus_too_many_restarts = { + # condition = ''changes(process_start_time_seconds{job=~"prometheus|alertmanager"}[15m]) > 2''; + # description = "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping."; + # }; + + # alert_manager_config_not_synced = { + # condition = ''count(count_values("config_hash", alertmanager_config_hash)) > 1''; + # description = "Configurations of AlertManager cluster instances are out of sync."; + # }; + + #alert_manager_e2e_dead_man_switch = { + # condition = "vector(1)"; + # description = "Prometheus DeadManSwitch is an always-firing alert. 
It's used as an end-to-end test of Prometheus through the Alertmanager."; + #}; + + # prometheus_not_connected_to_alertmanager = { + # condition = "prometheus_notifications_alertmanagers_discovered < 1"; + # description = "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"; + # }; + + # prometheus_rule_evaluation_failures = { + # condition = "increase(prometheus_rule_evaluation_failures_total[3m]) > 0"; + # description = "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"; + # }; + + # prometheus_template_expansion_failures = { + # condition = "increase(prometheus_template_text_expansion_failures_total[3m]) > 0"; + # time = "0m"; + # description = "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"; + # }; + + # promtail_file_lagging = { + # condition = ''abs(promtail_file_bytes_total - promtail_read_bytes_total) > 1e6''; + # time = "15m"; + # description = ''{{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} has been lagging by more than 1MB for more than 15m.''; + # }; + + filesystem_full_80percent = { + condition = '' + 100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}) > 80''; + time = "10m"; + description = + "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem."; + }; + + # filesystem_inodes_full = { + # condition = ''disk_inodes_free / disk_inodes_total < 0.10''; + # time = "10m"; + # description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 10% inodes left on its filesystem."; + # }; + + # daily_task_not_run = { + # # give 6 hours grace period + # condition = ''time() - task_last_run{state="ok",frequency="daily"} > (24 + 6) * 60 * 60''; + # 
description = "{{$labels.instance}}: {{$labels.name}} was not run in the last 24h"; + # }; + + # daily_task_failed = { + # condition = ''task_last_run{state="fail"}''; + # description = "{{$labels.instance}}: {{$labels.name}} failed to run"; + # }; + # } // (lib.genAttrs [ + # "borgbackup-turingmachine" + # "borgbackup-eve" + # "borgbackup-datastore" + # ] + # (name: { + # condition = ''absent_over_time(task_last_run{name="${name}"}[1d])''; + # description = "status of ${name} is unknown: no data for a day"; + # })) + # // { + + # borgbackup_matchbox_not_run = { + # # give 6 hours grace period + # condition = ''time() - task_last_run{state="ok",frequency="daily",name="borgbackup-matchbox"} > 7 * 24 * 60 * 60''; + # description = "{{$labels.instance}}: {{$labels.name}} was not run in the last week"; + # }; + + # borgbackup_matchbox = { + # condition = ''absent_over_time(task_last_run{name="borgbackup-matchbox"}[7d])''; + # description = "status of borgbackup-matchbox is unknown: no data for a week"; + # }; + + # homeassistant = { + # condition = '' + # homeassistant_entity_available{domain="persistent_notification", entity!="persistent_notification.http_login"} >= 0''; + # description = + # "homeassistant notification {{$labels.entity}} ({{$labels.friendly_name}}): {{$value}}"; + # }; + + swap_using_20percent = { + condition = + "node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2"; + time = "30m"; + description = + "{{$labels.instance}} is using 20% of its swap space for at least 30 minutes."; + }; + + systemd_service_failed = { + condition = ''node_systemd_unit_state{state="failed"} == 1''; + description = + "{{$labels.instance}} failed to (re)start service {{$labels.name}}."; + }; + + restic_backup_too_old = { + condition = ''(time() - restic_snapshots_latest_time)/(60*60) > 24''; + description = "{{$labels.instance}} not backed up for more than 24 hours. 
({{$value}})"; + }; + + host_down = { + condition = ''up{job="node-stats", instance!~"ahorn.wireguard:9100|kartoffel.wireguard:9100|mega.wireguard:9100"} == 0''; + description = "{{$labels.instance}} is down!"; + }; + + # service_not_running = { + # condition = ''systemd_units_active_code{name=~"teamspeak3-server.service|tt-rss.service", sub!="running"}''; + # description = "{{$labels.instance}} should have a running {{$labels.name}}."; + # }; + + ram_using_90percent = { + condition = + "node_memory_Buffers_bytes + node_memory_MemFree_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1"; + time = "1h"; + description = + "{{$labels.instance}} is using at least 90% of its RAM for at least 1 hour."; + }; + + cpu_using_90percent = { + condition = '' + 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) >= 90''; + time = "10m"; + description = + "{{$labels.instance}} is running with cpu usage > 90% for at least 10 minutes: {{$value}}"; + }; + + reboot = { + condition = "time() - node_boot_time_seconds < 300"; + description = "{{$labels.instance}} just rebooted."; + }; + + uptime = { + condition = "(time() - node_boot_time_seconds ) / (60*60*24) > 30"; + description = + "Uptime monster: {{$labels.instance}} has been up for more than 30 days."; + }; + + flake_nixpkgs_outdated = { + condition = '' + (time() - flake_input_last_modified{input="nixpkgs"}) / (60*60*24) > 30''; + description = + "Nixpkgs outdated: Nixpkgs on {{$labels.instance}} has not been updated in 30 days"; + }; + + /* ping = { + condition = "ping_result_code{type!='mobile'} != 0"; + description = "{{$labels.url}}: ping from {{$labels.instance}} has failed!"; + }; + + ping_high_latency = { + condition = "ping_average_response_ms{type!='mobile'} > 5000"; + description = "{{$labels.instance}}: ping probe from {{$labels.source}} is encountering high latency!"; + }; + */ + http_status = { + condition = '' + probe_http_status_code{instance!~"https://megaclan3000.de"} != 
200''; + description = + "http request failed from {{$labels.instance}}: {{$labels.result}}!"; + }; + /* http_match_failed = { + condition = "http_response_response_string_match == 0"; + description = "{{$labels.server}} : http body not as expected; status code: {{$labels.status_code}}!"; + }; + dns_query = { + condition = "dns_query_result_code != 0"; + description = "{{$labels.domain}} : could retrieve A record {{$labels.instance}} from server {{$labels.server}}: {{$labels.result}}!"; + }; + secure_dns_query = { + condition = "secure_dns_state != 0"; + description = "{{$labels.domain}} : could retrieve A record {{$labels.instance}} from server {{$labels.server}}: {{$labels.result}} for protocol {{$labels.protocol}}!"; + }; + connection_failed = { + condition = "net_response_result_code != 0"; + description = "{{$labels.server}}: connection to {{$labels.port}}({{$labels.protocol}}) failed from {{$labels.instance}}"; + }; + healthchecks = { + condition = "hc_check_up == 0"; + description = "{{$labels.instance}}: healtcheck {{$labels.job}} fails!"; + }; + */ + cert_expiry = { + condition = "(probe_ssl_earliest_cert_expiry - time())/(3600*24) < 30"; + description = + "{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s"; + }; + + # ignore devices that disabled S.M.A.R.T (example if attached via USB) + + # smart_errors = { + # condition = ''smart_device_health_ok{enabled!="Disabled"} != 1''; + # description = + # "{{$labels.instance}}: S.M.A.R.T reports: {{$labels.device}} ({{$labels.model}}) has errors."; + # }; + + oom_kills = { + condition = "increase(node_vmstat_oom_kill[5m]) > 0"; + description = "{{$labels.instance}}: OOM kill detected"; + }; + + /* unusual_disk_read_latency = { + condition = + "rate(diskio_read_time[1m]) / rate(diskio_reads[1m]) > 0.1 and rate(diskio_reads[1m]) > 0"; + description = '' + {{$labels.instance}}: Disk latency is growing (read operations > 100ms) + ''; + }; + + unusual_disk_write_latency = { + 
condition = + "rate(diskio_write_time[1m]) / rate(diskio_write[1m]) > 0.1 and rate(diskio_write[1m]) > 0"; + description = '' + {{$labels.instance}}: Disk latency is growing (write operations > 100ms) + ''; + }; + */ + + host_memory_under_memory_pressure = { + condition = "rate(node_vmstat_pgmajfault[1m]) > 1000"; + description = + "{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}"; + }; + + # ext4_errors = { + # condition = "ext4_errors_value > 0"; + # description = + # "{{$labels.instance}}: ext4 has reported {{$value}} I/O errors: check /sys/fs/ext4/*/errors_count"; + # }; + + # alerts_silences_changed = { + # condition = ''abs(delta(alertmanager_silences{state="active"}[1h])) >= 1''; + # description = + # "alertmanager: number of active silences has changed: {{$value}}"; + # }; + }) diff --git a/modules/prometheus/default.nix b/modules/prometheus/default.nix index de5d88b..1cad3a3 100644 --- a/modules/prometheus/default.nix +++ b/modules/prometheus/default.nix @@ -73,5 +73,50 @@ ]; } ]; + + ruleFiles = [ + (pkgs.writeText "prometheus-rules.yml" (builtins.toJSON { + groups = [{ + name = "alerting-rules"; + rules = import ./alert-rules.nix { inherit lib; }; + }]; + })) + ]; + + alertmanagers = [{ static_configs = [{ targets = [ "localhost:9093" ]; }]; }]; + + alertmanager = { + enable = true; + # port = 9093; # Default + webExternalUrl = "https://alerts.pub.solar"; # TODO use a proper url? 
+ # environmentFile = "${config.age.secrets.nachtigall-alertmanager-envfile.path}"; + configuration = { + + route = { + receiver = "all"; + group_by = [ "instance" ]; + group_wait = "30s"; + group_interval = "2m"; + repeat_interval = "24h"; + }; + + receivers = [{ + name = "all"; + # Email config documentation: https://prometheus.io/docs/alerting/latest/configuration/#email_config + email_configs = [{ + send_resolved = true; + to = "TODO"; + from = "alerts@pub.solar"; + smarthost = "TODO"; + auth_username = "TODO"; + auth_password_file = "${config.age.secrets.nachtigall-alertmanager-smtp-password.path}"; + require_tls = true; + }]; + # TODO: + # For matrix notifications, look into: https://github.com/pinpox/matrix-hook and add a webhook + # webhook_configs = [ { url = "http://127.0.0.1:11000/alert"; } ]; + }]; + }; + }; }; } From 8b7f54727683ab11ce53376dcb1e31f8fc0c6220 Mon Sep 17 00:00:00 2001 From: Pablo Ovelleiro Corral Date: Sat, 27 Apr 2024 00:23:43 +0200 Subject: [PATCH 2/9] Add dns entry --- terraform/dns.tf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/terraform/dns.tf b/terraform/dns.tf index 15e47c0..d805798 100644 --- a/terraform/dns.tf +++ b/terraform/dns.tf @@ -19,6 +19,11 @@ resource "namecheap_domain_records" "pub-solar" { type = "A" address = "80.71.153.210" } + record { + hostname = "alerts" + type = "CNAME" + address = "flora-6.pub.solar." 
+ } record { hostname = "git" type = "CNAME" From 2679b897a0c3819f8ee35c2b1207cef4416e2f4d Mon Sep 17 00:00:00 2001 From: Pablo Ovelleiro Corral Date: Sat, 27 Apr 2024 00:26:52 +0200 Subject: [PATCH 3/9] Autoformat dns.tf --- terraform/dns.tf | 146 +++++++++++++++++++++++------------------------ 1 file changed, 73 insertions(+), 73 deletions(-) diff --git a/terraform/dns.tf b/terraform/dns.tf index d805798..a35978c 100644 --- a/terraform/dns.tf +++ b/terraform/dns.tf @@ -1,23 +1,23 @@ # https://registry.terraform.io/providers/namecheap/namecheap/latest/docs resource "namecheap_domain_records" "pub-solar" { - domain = "pub.solar" - mode = "OVERWRITE" + domain = "pub.solar" + mode = "OVERWRITE" email_type = "MX" record { hostname = "flora-6" - type = "A" - address = "80.71.153.210" + type = "A" + address = "80.71.153.210" } record { hostname = "auth" - type = "CNAME" - address = "nachtigall.pub.solar." + type = "CNAME" + address = "nachtigall.pub.solar." } record { hostname = "ci" - type = "A" - address = "80.71.153.210" + type = "A" + address = "80.71.153.210" } record { hostname = "alerts" @@ -26,84 +26,84 @@ resource "namecheap_domain_records" "pub-solar" { } record { hostname = "git" - type = "CNAME" - address = "nachtigall.pub.solar." + type = "CNAME" + address = "nachtigall.pub.solar." } record { hostname = "stream" - type = "CNAME" - address = "nachtigall.pub.solar." + type = "CNAME" + address = "nachtigall.pub.solar." } record { hostname = "list" - type = "CNAME" - address = "nachtigall.pub.solar." + type = "CNAME" + address = "nachtigall.pub.solar." } record { hostname = "obs-portal" - type = "CNAME" - address = "nachtigall.pub.solar." + type = "CNAME" + address = "nachtigall.pub.solar." 
} record { hostname = "vpn" - type = "A" - address = "80.71.153.210" + type = "A" + address = "80.71.153.210" } record { hostname = "cache" - type = "A" - address = "95.217.225.160" + type = "A" + address = "95.217.225.160" } record { hostname = "factorio" - type = "A" - address = "80.244.242.2" + type = "A" + address = "80.244.242.2" } record { hostname = "collabora" - type = "CNAME" - address = "nachtigall.pub.solar." + type = "CNAME" + address = "nachtigall.pub.solar." } record { hostname = "@" - type = "ALIAS" - address = "nachtigall.pub.solar." - ttl = 300 + type = "ALIAS" + address = "nachtigall.pub.solar." + ttl = 300 } record { hostname = "chat" - type = "CNAME" - address = "nachtigall.pub.solar." + type = "CNAME" + address = "nachtigall.pub.solar." } record { hostname = "cloud" - type = "CNAME" - address = "nachtigall.pub.solar." + type = "CNAME" + address = "nachtigall.pub.solar." } record { hostname = "turn" - type = "CNAME" - address = "nachtigall.pub.solar." + type = "CNAME" + address = "nachtigall.pub.solar." } record { hostname = "grafana" - type = "A" - address = "80.71.153.210" + type = "A" + address = "80.71.153.210" } record { hostname = "hpb" - type = "A" - address = "80.71.153.239" + type = "A" + address = "80.71.153.239" } record { hostname = "files" - type = "CNAME" - address = "nachtigall.pub.solar." + type = "CNAME" + address = "nachtigall.pub.solar." } record { hostname = "search" - type = "CNAME" - address = "nachtigall.pub.solar." + type = "CNAME" + address = "nachtigall.pub.solar." } record { hostname = "stickers.chat" @@ -112,80 +112,80 @@ resource "namecheap_domain_records" "pub-solar" { } record { hostname = "wiki" - type = "CNAME" - address = "nachtigall.pub.solar." + type = "CNAME" + address = "nachtigall.pub.solar." } record { hostname = "mastodon" - type = "CNAME" - address = "nachtigall.pub.solar." + type = "CNAME" + address = "nachtigall.pub.solar." 
} record { hostname = "matrix" - type = "CNAME" - address = "nachtigall.pub.solar." + type = "CNAME" + address = "nachtigall.pub.solar." } record { hostname = "tmate" - type = "CNAME" - address = "nachtigall.pub.solar." + type = "CNAME" + address = "nachtigall.pub.solar." } record { hostname = "www" - type = "CNAME" - address = "nachtigall.pub.solar." + type = "CNAME" + address = "nachtigall.pub.solar." } record { hostname = "@" - type = "TXT" - address = "v=spf1 include:spf.greenbaum.zone a:list.pub.solar ~all" + type = "TXT" + address = "v=spf1 include:spf.greenbaum.zone a:list.pub.solar ~all" } record { hostname = "list" - type = "TXT" - address = "v=spf1 a:list.pub.solar ?all" + type = "TXT" + address = "v=spf1 a:list.pub.solar ?all" } record { hostname = "_dmarc" - type = "TXT" - address = "v=DMARC1; p=reject;" + type = "TXT" + address = "v=DMARC1; p=reject;" } record { hostname = "_dmarc.list" - type = "TXT" - address = "v=DMARC1; p=reject;" + type = "TXT" + address = "v=DMARC1; p=reject;" } record { hostname = "modoboa._domainkey" - type = "TXT" - address = "v=DKIM1;k=rsa;p=MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAx/EqLMpk0MyL1aQ0JVG44ypTRbZBVA13MFjEntxAvowaWtq1smRbnEwTTKgqUOrUyaM4dVmli1dedne4mk/ncqRAm02KuhtTY+5wXfhTKK53EhqehbKwH+Qvzb12983Qwdau/QTHiFHwXHufMaSsCvd9CRWCp9q68Q7noQqndJeLHT6L0eECd2Zk3ZxJuh+Fxdb7+Kw68Tf6z13Rs+MU01qLM7x0jmSQHa4cv2pk+7NTGMBRp6fVskfbqev5nFkZWJ7rhXEbP9Eukd/L3ro/ubs1quWJotG02gPRKE8fgkm1Ytlws1/pnqpuvKXQS1HzBEP1X2ExezJMzQ1SnZCigQIDAQAB" + type = "TXT" + address = "v=DKIM1;k=rsa;p=MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAx/EqLMpk0MyL1aQ0JVG44ypTRbZBVA13MFjEntxAvowaWtq1smRbnEwTTKgqUOrUyaM4dVmli1dedne4mk/ncqRAm02KuhtTY+5wXfhTKK53EhqehbKwH+Qvzb12983Qwdau/QTHiFHwXHufMaSsCvd9CRWCp9q68Q7noQqndJeLHT6L0eECd2Zk3ZxJuh+Fxdb7+Kw68Tf6z13Rs+MU01qLM7x0jmSQHa4cv2pk+7NTGMBRp6fVskfbqev5nFkZWJ7rhXEbP9Eukd/L3ro/ubs1quWJotG02gPRKE8fgkm1Ytlws1/pnqpuvKXQS1HzBEP1X2ExezJMzQ1SnZCigQIDAQAB" } record { hostname = "@" - type = "MX" - address = 
"mail.greenbaum.zone." - mx_pref = "0" + type = "MX" + address = "mail.greenbaum.zone." + mx_pref = "0" } record { hostname = "list" - type = "MX" - address = "list.pub.solar." - mx_pref = "0" + type = "MX" + address = "list.pub.solar." + mx_pref = "0" } record { hostname = "nachtigall" - type = "A" - address = "138.201.80.102" + type = "A" + address = "138.201.80.102" } record { hostname = "nachtigall" - type = "AAAA" - address = "2a01:4f8:172:1c25::1" + type = "AAAA" + address = "2a01:4f8:172:1c25::1" } record { hostname = "matrix.test" - type = "CNAME" - address = "nachtigall.pub.solar." + type = "CNAME" + address = "nachtigall.pub.solar." } # SRV records can only be changed via NameCheap Web UI # add comment From 11f5557a7a80750608926e58ab92db50151faec9 Mon Sep 17 00:00:00 2001 From: Pablo Ovelleiro Corral Date: Sat, 27 Apr 2024 01:37:03 +0200 Subject: [PATCH 4/9] Add reverseproxy for alerts.pub.solar Co-authored-by: teutat3s --- modules/prometheus/default.nix | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/modules/prometheus/default.nix b/modules/prometheus/default.nix index 1cad3a3..a402f35 100644 --- a/modules/prometheus/default.nix +++ b/modules/prometheus/default.nix @@ -12,6 +12,15 @@ owner = "prometheus"; }; + services.caddy.virtualHosts."prometheus.${config.pub-solar-os.networking.domain}" = { + logFormat = lib.mkForce '' + output discard + ''; + extraConfig = '' + reverse_proxy :${toString config.services.prometheus.alertmanager.port} + ''; + }; + services.prometheus = { enable = true; port = 9001; From a8a8155114bbe1a848754b843219401d88147d30 Mon Sep 17 00:00:00 2001 From: teutat3s Date: Sun, 12 May 2024 21:17:49 +0200 Subject: [PATCH 5/9] style: treefmt with nixfmt-rfc-style --- modules/prometheus/alert-rules.nix | 57 ++++++++++++------------------ modules/prometheus/default.nix | 54 ++++++++++++++++------------ 2 files changed, 53 insertions(+), 58 deletions(-) diff --git a/modules/prometheus/alert-rules.nix 
b/modules/prometheus/alert-rules.nix index 4561388..23f66ab 100644 --- a/modules/prometheus/alert-rules.nix +++ b/modules/prometheus/alert-rules.nix @@ -52,11 +52,9 @@ lib.mapAttrsToList # }; filesystem_full_80percent = { - condition = '' - 100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}) > 80''; + condition = ''100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}) > 80''; time = "10m"; - description = - "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem."; + description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem."; }; # filesystem_inodes_full = { @@ -105,17 +103,14 @@ lib.mapAttrsToList # }; swap_using_20percent = { - condition = - "node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2"; + condition = "node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2"; time = "30m"; - description = - "{{$labels.instance}} is using 20% of its swap space for at least 30 minutes."; + description = "{{$labels.instance}} is using 20% of its swap space for at least 30 minutes."; }; systemd_service_failed = { condition = ''node_systemd_unit_state{state="failed"} == 1''; - description = - "{{$labels.instance}} failed to (re)start service {{$labels.name}}."; + description = "{{$labels.instance}} failed to (re)start service {{$labels.name}}."; }; restic_backup_too_old = { @@ -134,19 +129,15 @@ lib.mapAttrsToList # }; ram_using_90percent = { - condition = - "node_memory_Buffers_bytes + node_memory_MemFree_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1"; + condition = "node_memory_Buffers_bytes + 
node_memory_MemFree_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1"; time = "1h"; - description = - "{{$labels.instance}} is using at least 90% of its RAM for at least 1 hour."; + description = "{{$labels.instance}} is using at least 90% of its RAM for at least 1 hour."; }; cpu_using_90percent = { - condition = '' - 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) >= 90''; + condition = ''100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) >= 90''; time = "10m"; - description = - "{{$labels.instance}} is running with cpu usage > 90% for at least 10 minutes: {{$value}}"; + description = "{{$labels.instance}} is running with cpu usage > 90% for at least 10 minutes: {{$value}}"; }; reboot = { @@ -156,18 +147,16 @@ lib.mapAttrsToList uptime = { condition = "(time() - node_boot_time_seconds ) / (60*60*24) > 30"; - description = - "Uptime monster: {{$labels.instance}} has been up for more than 30 days."; + description = "Uptime monster: {{$labels.instance}} has been up for more than 30 days."; }; flake_nixpkgs_outdated = { - condition = '' - (time() - flake_input_last_modified{input="nixpkgs"}) / (60*60*24) > 30''; - description = - "Nixpkgs outdated: Nixpkgs on {{$labels.instance}} has not been updated in 30 days"; + condition = ''(time() - flake_input_last_modified{input="nixpkgs"}) / (60*60*24) > 30''; + description = "Nixpkgs outdated: Nixpkgs on {{$labels.instance}} has not been updated in 30 days"; }; - /* ping = { + /* + ping = { condition = "ping_result_code{type!='mobile'} != 0"; description = "{{$labels.url}}: ping from {{$labels.instance}} has failed!"; }; @@ -178,12 +167,11 @@ lib.mapAttrsToList }; */ http_status = { - condition = '' - probe_http_status_code{instance!~"https://megaclan3000.de"} != 200''; - description = - "http request failed from {{$labels.instance}}: {{$labels.result}}!"; + condition = ''probe_http_status_code{instance!~"https://megaclan3000.de"} != 200''; + 
description = "http request failed from {{$labels.instance}}: {{$labels.result}}!"; }; - /* http_match_failed = { + /* + http_match_failed = { condition = "http_response_response_string_match == 0"; description = "{{$labels.server}} : http body not as expected; status code: {{$labels.status_code}}!"; }; @@ -206,8 +194,7 @@ lib.mapAttrsToList */ cert_expiry = { condition = "(probe_ssl_earliest_cert_expiry - time())/(3600*24) < 30"; - description = - "{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s"; + description = "{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s"; }; # ignore devices that disabled S.M.A.R.T (example if attached via USB) @@ -223,7 +210,8 @@ lib.mapAttrsToList description = "{{$labels.instance}}: OOM kill detected"; }; - /* unusual_disk_read_latency = { + /* + unusual_disk_read_latency = { condition = "rate(diskio_read_time[1m]) / rate(diskio_reads[1m]) > 0.1 and rate(diskio_reads[1m]) > 0"; description = '' @@ -242,8 +230,7 @@ lib.mapAttrsToList host_memory_under_memory_pressure = { condition = "rate(node_vmstat_pgmajfault[1m]) > 1000"; - description = - "{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}"; + description = "{{$labels.instance}}: The node is under heavy memory pressure. 
High rate of major page faults: {{$value}}"; }; # ext4_errors = { diff --git a/modules/prometheus/default.nix b/modules/prometheus/default.nix index a402f35..04a9e38 100644 --- a/modules/prometheus/default.nix +++ b/modules/prometheus/default.nix @@ -84,15 +84,19 @@ ]; ruleFiles = [ - (pkgs.writeText "prometheus-rules.yml" (builtins.toJSON { - groups = [{ - name = "alerting-rules"; - rules = import ./alert-rules.nix { inherit lib; }; - }]; - })) + (pkgs.writeText "prometheus-rules.yml" ( + builtins.toJSON { + groups = [ + { + name = "alerting-rules"; + rules = import ./alert-rules.nix { inherit lib; }; + } + ]; + } + )) ]; - alertmanagers = [{ static_configs = [{ targets = [ "localhost:9093" ]; }]; }]; + alertmanagers = [ { static_configs = [ { targets = [ "localhost:9093" ]; } ]; } ]; alertmanager = { enable = true; @@ -109,22 +113,26 @@ repeat_interval = "24h"; }; - receivers = [{ - name = "all"; - # Email config documentation: https://prometheus.io/docs/alerting/latest/configuration/#email_config - email_configs = [{ - send_resolved = true; - to = "TODO"; - from = "alerts@pub.solar"; - smarthost = "TODO"; - auth_username = "TODO"; - auth_password_file = "${config.age.secrets.nachtigall-alertmanager-smtp-password.path}"; - require_tls = true; - }]; - # TODO: - # For matrix notifications, look into: https://github.com/pinpox/matrix-hook and add a webhook - # webhook_configs = [ { url = "http://127.0.0.1:11000/alert"; } ]; - }]; + receivers = [ + { + name = "all"; + # Email config documentation: https://prometheus.io/docs/alerting/latest/configuration/#email_config + email_configs = [ + { + send_resolved = true; + to = "TODO"; + from = "alerts@pub.solar"; + smarthost = "TODO"; + auth_username = "TODO"; + auth_password_file = "${config.age.secrets.nachtigall-alertmanager-smtp-password.path}"; + require_tls = true; + } + ]; + # TODO: + # For matrix notifications, look into: https://github.com/pinpox/matrix-hook and add a webhook + # webhook_configs = [ { url = 
"http://127.0.0.1:11000/alert"; } ]; + } + ]; }; }; }; From 9245fa679756874af953bf9bef147dfd2703f10a Mon Sep 17 00:00:00 2001 From: teutat3s Date: Sun, 12 May 2024 22:17:58 +0200 Subject: [PATCH 6/9] alertmanager: finalize init --- modules/grafana/default.nix | 1 + modules/prometheus/alert-rules.nix | 42 +++++++++++++++++------------- modules/prometheus/default.nix | 12 +++++---- terraform/dns.tf | 4 +-- 4 files changed, 34 insertions(+), 25 deletions(-) diff --git a/modules/grafana/default.nix b/modules/grafana/default.nix index e8fa718..be12474 100644 --- a/modules/grafana/default.nix +++ b/modules/grafana/default.nix @@ -15,6 +15,7 @@ file = "${flake.self}/secrets/grafana-smtp-password.age"; mode = "644"; owner = "grafana"; + group = "prometheus"; }; age.secrets.grafana-keycloak-client-secret = { file = "${flake.self}/secrets/grafana-keycloak-client-secret.age"; diff --git a/modules/prometheus/alert-rules.nix b/modules/prometheus/alert-rules.nix index 23f66ab..fb832a0 100644 --- a/modules/prometheus/alert-rules.nix +++ b/modules/prometheus/alert-rules.nix @@ -51,12 +51,18 @@ lib.mapAttrsToList # description = ''{{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} has been lagging by more than 1MB for more than 15m.''; # }; - filesystem_full_80percent = { + filesystem_root_full_80percent = { condition = ''100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}) > 80''; time = "10m"; description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem."; }; + filesystem_data_full_80percent = { + condition = ''100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/var/lib"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/var/lib"}) > 80''; + time = "10m"; + description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its 
filesystem."; + }; + # filesystem_inodes_full = { # condition = ''disk_inodes_free / disk_inodes_total < 0.10''; # time = "10m"; @@ -102,11 +108,11 @@ lib.mapAttrsToList # "homeassistant notification {{$labels.entity}} ({{$labels.friendly_name}}): {{$value}}"; # }; - swap_using_20percent = { - condition = "node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2"; - time = "30m"; - description = "{{$labels.instance}} is using 20% of its swap space for at least 30 minutes."; - }; + #swap_using_20percent = { + # condition = "node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2"; + # time = "30m"; + # description = "{{$labels.instance}} is using 20% of its swap space for at least 30 minutes."; + #}; systemd_service_failed = { condition = ''node_systemd_unit_state{state="failed"} == 1''; @@ -118,10 +124,10 @@ lib.mapAttrsToList description = "{{$labels.instance}} not backed up for more than 24 hours. 
({{$value}})"; }; - host_down = { - condition = ''up{job="node-stats", instance!~"ahorn.wireguard:9100|kartoffel.wireguard:9100|mega.wireguard:9100"} == 0''; - description = "{{$labels.instance}} is down!"; - }; + #host_down = { + # condition = ''up{job="node-stats", instance!~"ahorn.wireguard:9100|kartoffel.wireguard:9100|mega.wireguard:9100"} == 0''; + # description = "{{$labels.instance}} is down!"; + #}; # service_not_running = { # condition = ''systemd_units_active_code{name=~"teamspeak3-server.service|tt-rss.service", sub!="running"}''; @@ -166,10 +172,10 @@ lib.mapAttrsToList description = "{{$labels.instance}}: ping probe from {{$labels.source}} is encountering high latency!"; }; */ - http_status = { - condition = ''probe_http_status_code{instance!~"https://megaclan3000.de"} != 200''; - description = "http request failed from {{$labels.instance}}: {{$labels.result}}!"; - }; + #http_status = { + # condition = ''probe_http_status_code{instance!~"https://pub.solar"} != 200''; + # description = "http request failed from {{$labels.instance}}: {{$labels.result}}!"; + #}; /* http_match_failed = { condition = "http_response_response_string_match == 0"; @@ -192,10 +198,10 @@ lib.mapAttrsToList description = "{{$labels.instance}}: healtcheck {{$labels.job}} fails!"; }; */ - cert_expiry = { - condition = "(probe_ssl_earliest_cert_expiry - time())/(3600*24) < 30"; - description = "{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s"; - }; + #cert_expiry = { + # condition = "(probe_ssl_earliest_cert_expiry - time())/(3600*24) < 30"; + # description = "{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s"; + #}; # ignore devices that disabled S.M.A.R.T (example if attached via USB) diff --git a/modules/prometheus/default.nix b/modules/prometheus/default.nix index 04a9e38..c58c48b 100644 --- a/modules/prometheus/default.nix +++ b/modules/prometheus/default.nix @@ -17,6 +17,8 @@ output discard ''; 
extraConfig = '' + bind 10.7.6.2 fd00:fae:fae:fae:fae:2:: + tls internal reverse_proxy :${toString config.services.prometheus.alertmanager.port} ''; }; @@ -101,7 +103,7 @@ alertmanager = { enable = true; # port = 9093; # Default - webExternalUrl = "https://alerts.pub.solar"; # TODO use a proper url? + webExternalUrl = "https://alerts.pub.solar"; # environmentFile = "${config.age.secrets.nachtigall-alertmanager-envfile.path}"; configuration = { @@ -120,11 +122,11 @@ email_configs = [ { send_resolved = true; - to = "TODO"; + to = "admins@pub.solar"; from = "alerts@pub.solar"; - smarthost = "TODO"; - auth_username = "TODO"; - auth_password_file = "${config.age.secrets.nachtigall-alertmanager-smtp-password.path}"; + smarthost = "mail.greenbaum.zone:465"; + auth_username = "admins@pub.solar"; + auth_password_file = "${config.age.secrets.grafana-smtp-password.path}"; require_tls = true; } ]; diff --git a/terraform/dns.tf b/terraform/dns.tf index a35978c..039dd86 100644 --- a/terraform/dns.tf +++ b/terraform/dns.tf @@ -21,8 +21,8 @@ resource "namecheap_domain_records" "pub-solar" { } record { hostname = "alerts" - type = "CNAME" - address = "flora-6.pub.solar." 
+ type = "A" + address = "10.7.6.2" } record { hostname = "git" From d1a68a7c13e31f4eb337815da64fd69fe2404ecf Mon Sep 17 00:00:00 2001 From: teutat3s Date: Sun, 12 May 2024 22:18:28 +0200 Subject: [PATCH 7/9] secrets: fix too open permissions --- modules/forgejo-actions-runner/default.nix | 2 +- modules/grafana/default.nix | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/forgejo-actions-runner/default.nix b/modules/forgejo-actions-runner/default.nix index 0cd3c8e..2774237 100644 --- a/modules/forgejo-actions-runner/default.nix +++ b/modules/forgejo-actions-runner/default.nix @@ -8,7 +8,7 @@ { age.secrets.forgejo-actions-runner-token = { file = "${flake.self}/secrets/forgejo-actions-runner-token.age"; - mode = "644"; + mode = "440"; }; # Trust docker bridge interface traffic diff --git a/modules/grafana/default.nix b/modules/grafana/default.nix index be12474..624caf3 100644 --- a/modules/grafana/default.nix +++ b/modules/grafana/default.nix @@ -8,18 +8,18 @@ { age.secrets.grafana-admin-password = { file = "${flake.self}/secrets/grafana-admin-password.age"; - mode = "644"; + mode = "440"; owner = "grafana"; }; age.secrets.grafana-smtp-password = { file = "${flake.self}/secrets/grafana-smtp-password.age"; - mode = "644"; + mode = "440"; owner = "grafana"; group = "prometheus"; }; age.secrets.grafana-keycloak-client-secret = { file = "${flake.self}/secrets/grafana-keycloak-client-secret.age"; - mode = "644"; + mode = "440"; owner = "grafana"; }; From bd4241e71d9b38d989041accbcbcb88ad0a9effa Mon Sep 17 00:00:00 2001 From: teutat3s Date: Wed, 15 May 2024 16:17:54 +0200 Subject: [PATCH 8/9] caddy: use alerts.pub.solar domain for vhost --- modules/prometheus/default.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/prometheus/default.nix b/modules/prometheus/default.nix index c58c48b..b1f8fdd 100644 --- a/modules/prometheus/default.nix +++ b/modules/prometheus/default.nix @@ -12,7 +12,7 @@ owner = "prometheus"; }; 
- services.caddy.virtualHosts."prometheus.${config.pub-solar-os.networking.domain}" = { + services.caddy.virtualHosts."alerts.${config.pub-solar-os.networking.domain}" = { logFormat = lib.mkForce '' output discard ''; From e52324209f14618fd8de2b5f387e996b18cd617b Mon Sep 17 00:00:00 2001 From: teutat3s Date: Wed, 15 May 2024 17:15:46 +0200 Subject: [PATCH 9/9] alertmanager: fix SMTP secret --- modules/grafana/default.nix | 1 - modules/prometheus/default.nix | 11 +++++--- secrets/alertmanager-envfile.age | 43 ++++++++++++++++++++++++++++++++ secrets/secrets.nix | 1 + 4 files changed, 52 insertions(+), 4 deletions(-) create mode 100644 secrets/alertmanager-envfile.age diff --git a/modules/grafana/default.nix b/modules/grafana/default.nix index 624caf3..1080a1d 100644 --- a/modules/grafana/default.nix +++ b/modules/grafana/default.nix @@ -15,7 +15,6 @@ file = "${flake.self}/secrets/grafana-smtp-password.age"; mode = "440"; owner = "grafana"; - group = "prometheus"; }; age.secrets.grafana-keycloak-client-secret = { file = "${flake.self}/secrets/grafana-keycloak-client-secret.age"; diff --git a/modules/prometheus/default.nix b/modules/prometheus/default.nix index b1f8fdd..f77081a 100644 --- a/modules/prometheus/default.nix +++ b/modules/prometheus/default.nix @@ -11,6 +11,11 @@ mode = "600"; owner = "prometheus"; }; + age.secrets.alertmanager-envfile = { + file = "${flake.self}/secrets/alertmanager-envfile.age"; + mode = "600"; + owner = "alertmanager"; + }; services.caddy.virtualHosts."alerts.${config.pub-solar-os.networking.domain}" = { logFormat = lib.mkForce '' @@ -104,7 +109,7 @@ enable = true; # port = 9093; # Default webExternalUrl = "https://alerts.pub.solar"; - # environmentFile = "${config.age.secrets.nachtigall-alertmanager-envfile.path}"; + environmentFile = "${config.age.secrets.alertmanager-envfile.path}"; configuration = { route = { @@ -126,8 +131,8 @@ from = "alerts@pub.solar"; smarthost = "mail.greenbaum.zone:465"; auth_username = "admins@pub.solar"; - 
auth_password_file = "${config.age.secrets.grafana-smtp-password.path}"; - require_tls = true; + auth_password = "$SMTP_AUTH_PASSWORD"; + require_tls = false; } ]; # TODO: diff --git a/secrets/alertmanager-envfile.age b/secrets/alertmanager-envfile.age new file mode 100644 index 0000000..17191dc --- /dev/null +++ b/secrets/alertmanager-envfile.age @@ -0,0 +1,43 @@ +age-encryption.org/v1 +-> ssh-ed25519 Y0ZZaw TsTaRLA+9WtN9+FJWpXeP12Af5EXMbo+ANTaLC9YlC8 +Yols084RY1C9gfOrDMwJcFRuGZ/5dgGuJey7RXqm7g0 +-> ssh-ed25519 uYcDNw ZLAINtv10PGMtK5TL5Tf0NyK/r1iww+vTC09ElMGoX0 +EgBB3aiHHdaDue9+Zdxg6mTV2VHeLoDN9wT+hlAzVMk +-> ssh-rsa f5THog +aiJqMs3/u06tzs8lx2ISlQm87TDatqEn47v3LB3HehPanRpZx9O1HUIRTeiWkMU9 +XroGe27HQCCPd63QunBHUH7WStA10IS4rHVpMcULB5IM4jwcbOhSYSiGyY2sbv8+ +Nn/04ZOwrfzTabC7moV1DqAw6hnlDqKWp/q5N6xMb780w5vn6Poni3OJfuLaBWaT +r6WhE5evVt3F4jyYI64fB2hFw4AR2N/zIMOMvBncLFwJf9lbIFdbsENZf94cYceF +Tj150xdMPuErBsSJQOlfDYSmyioNN3UJUWiYsDeM3nbPEVPHhfTk6b2/lMhSQkcY +KcuMj/mN/7w7i4HSxW6mUcK2sUMV1BcSSGYRH9ZFf7kq++KpyiP7vB8vaZkcKbfJ +qqrIcXTuXhR+/bWZWqf/GQOVwRwe1TnqN5MoZHipg3a/UCe0gMM617VwZcfhBzjA +eW6VUdjSewwA8YHEuDrAeoQ4CMs7y56EaIlr2IlQy6uzJPX9eeO0auO9RZ5AR40a +7un0FrlTJX9uorpCD/zi3tvd22W5qVoMGZ8vXJShZmT9he9K3Bv6XbzG4DJQ9/nv +xZ676HUYhWeyYZFBvt6DnEBneiDJFeaV2AeuQY+juHBOfBrbYmlE0S4Pd8uRSJ7w +u5UJTT+RV5TkZhpCqqYm7DphYocnrv7Ic+QKmvKE4ls +-> ssh-rsa kFDS0A +HhilpvIiUps80SXYUXg5vqNmcy8SACvxpC5dTVBU2n+4OVXQY/35Il5ZOrUX3U7a +arfVp/KaQF7Oncu3x8F6Tp1ibUwmoyAV6OYqqs128nEPwkNbJvwrLY3aEBm+NIzm +gMlLRjj6EP84TVWgOsenQCS4l957f0QoNVxQ3f+GWdOiZZJFsv//ndsflng8zPlF +bGZy8c1TxDZfOD0/kW3Nx05c9X0EHKOEoDUc0p4qntrWlflxcvLONCgv1gZuPMF+ +jMsPFP81eu3rkEUxefJ1qbvvGuW0cbzfwiStv7iGQ+Skh/vcoM0qw6p+csNKyHVO +8nYFcs9kD8067zMnyuqiUHASfZ4rPqTji0iiPC5kZn6N0YSgz2bybkXcoqmy3m6y +qs0S+RD99o2vCLhW46hZyKAgUyTU1DW42EmnZkPrLoqV7uin8fAwPO/98Q/b3Rkr +zBRtyTEbooHvOCL8limiRtDl+5LMcjRFNWk8AN+9vHMsYurXPNOCnd8n2Z4MbT2U +AhpoAD/+8HXp0InBJ/sclITVAc6tPb2CbJW6mrFezH8Ri+/6u+zSF84JDd9ZrCOz 
+oIshiGZmhP5mIuspVrxgKlm78a56vQrygpqzvuSSYk3zIJxmhEkZhw09/ga+rhyB +pkKn7GRyZTfKjwt5nnvW5/bmQndTa13j+7RhkRgBSvU +-> piv-p256 vRzPNw Awpc8paUfKnP6r0bYsaoeDE9GVSnads4/a3jCVScgS4V +YydKOS09kyZDYN843SHIsYUimtSQKvGhIuycPWOFojc +-> piv-p256 zqq/iw A54xbcufPkLpTD+N47AiIe/xZ/0vA5kDJ4p3rIZw0a4A +1WFP2K3tfUxtdKDBEmT3cx/u1i5nCzFR7cK4kN3WjC4 +-> ssh-ed25519 YFSOsg L0lPSkoPVRKGlJ9MzkJx+cQvnZw/5m/j/JO4aRzd52Q +o/N7zQkvbGGoadiJSvL6lfuP63uqzxEIxDtIg4tgKIo +-> ssh-ed25519 iHV63A qfLWZhbDisCSJ4vFFTR+XpRUR0WViuAqarf56M0ekT4 +ZSWW34pFRr0M2jFhnphIPJ5ch37ASM6OgTzyHSo0KAs +-> ssh-ed25519 BVsyTA JcFezSIfTF+AP8LYfFqz+wIpUrE0aoc1usiLtWxAPQE +F9uhFyCPK46kIy+ud4V5/ESacQgc9R0JV+JTEZO6nBI +-> ssh-ed25519 +3V2lQ G4yT1e7B5O2Gy6tusRMxuWOFScynWfFY5AjrJvxMK1o +n1OVFRqzijWlc+B93cBNdFPz+8CBYOsI5hpF1wz7xr0 +--- 61u55uUc7z59iHF1IeyBLmcR6u7STUhpOPb/ODf75Vc +<$kxp H:}*/T$bJ \F*Wz6 <̹>e?񼐟6ڵ~! \ No newline at end of file diff --git a/secrets/secrets.nix b/secrets/secrets.nix index 94a3b11..36202b5 100644 --- a/secrets/secrets.nix +++ b/secrets/secrets.nix @@ -60,6 +60,7 @@ in "grafana-keycloak-client-secret.age".publicKeys = flora6Keys ++ adminKeys; "grafana-smtp-password.age".publicKeys = flora6Keys ++ adminKeys; + "alertmanager-envfile.age".publicKeys = flora6Keys ++ adminKeys; "nachtigall-metrics-nginx-basic-auth.age".publicKeys = nachtigallKeys ++ adminKeys; "nachtigall-metrics-prometheus-basic-auth-password.age".publicKeys = flora6Keys ++ nachtigallKeys ++ adminKeys;