alertmanager: finalize init

This commit is contained in:
teutat3s 2024-05-12 22:17:58 +02:00
parent a8a8155114
commit 9245fa6797
Signed by: teutat3s
GPG key ID: 4FA1D3FA524F22C1
4 changed files with 34 additions and 25 deletions

View file

@ -15,6 +15,7 @@
file = "${flake.self}/secrets/grafana-smtp-password.age";
mode = "644";
owner = "grafana";
group = "prometheus";
};
age.secrets.grafana-keycloak-client-secret = {
file = "${flake.self}/secrets/grafana-keycloak-client-secret.age";

View file

@ -51,12 +51,18 @@ lib.mapAttrsToList
# description = ''{{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} has been lagging by more than 1MB for more than 15m.'';
# };
filesystem_full_80percent = {
# Alert rule for the root mountpoint ("/"): the expression computes the
# percentage of the filesystem that is NOT available (100 - avail * 100 / size)
# and matches when it exceeds 80. Series with fstype "rootfs" are excluded.
# NOTE(review): `time` is presumably the duration the condition must hold
# before the alert fires -- confirm against the function these rules are mapped through.
filesystem_root_full_80percent = {
condition = ''100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}) > 80'';
time = "10m";
description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem.";
};
# Alert rule for the "/var/lib" mountpoint: the expression computes the
# percentage of the filesystem that is NOT available (100 - avail * 100 / size)
# and matches when it exceeds 80. Series with fstype "rootfs" are excluded.
# NOTE(review): `time` is presumably the duration the condition must hold
# before the alert fires -- confirm against the function these rules are mapped through.
filesystem_data_full_80percent = {
condition = ''100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/var/lib"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/var/lib"}) > 80'';
time = "10m";
description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem.";
};
# filesystem_inodes_full = {
# condition = ''disk_inodes_free / disk_inodes_total < 0.10'';
# time = "10m";
@ -102,11 +108,11 @@ lib.mapAttrsToList
# "homeassistant notification {{$labels.entity}} ({{$labels.friendly_name}}): {{$value}}";
# };
swap_using_20percent = {
condition = "node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2";
time = "30m";
description = "{{$labels.instance}} is using 20% of its swap space for at least 30 minutes.";
};
#swap_using_20percent = {
# condition = "node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2";
# time = "30m";
# description = "{{$labels.instance}} is using 20% of its swap space for at least 30 minutes.";
#};
systemd_service_failed = {
condition = ''node_systemd_unit_state{state="failed"} == 1'';
@ -118,10 +124,10 @@ lib.mapAttrsToList
description = "{{$labels.instance}} not backed up for more than 24 hours. ({{$value}})";
};
host_down = {
condition = ''up{job="node-stats", instance!~"ahorn.wireguard:9100|kartoffel.wireguard:9100|mega.wireguard:9100"} == 0'';
description = "{{$labels.instance}} is down!";
};
#host_down = {
# condition = ''up{job="node-stats", instance!~"ahorn.wireguard:9100|kartoffel.wireguard:9100|mega.wireguard:9100"} == 0'';
# description = "{{$labels.instance}} is down!";
#};
# service_not_running = {
# condition = ''systemd_units_active_code{name=~"teamspeak3-server.service|tt-rss.service", sub!="running"}'';
@ -166,10 +172,10 @@ lib.mapAttrsToList
description = "{{$labels.instance}}: ping probe from {{$labels.source}} is encountering high latency!";
};
*/
http_status = {
condition = ''probe_http_status_code{instance!~"https://megaclan3000.de"} != 200'';
description = "http request failed from {{$labels.instance}}: {{$labels.result}}!";
};
#http_status = {
# condition = ''probe_http_status_code{instance!~"https://pub.solar"} != 200'';
# description = "http request failed from {{$labels.instance}}: {{$labels.result}}!";
#};
/*
http_match_failed = {
condition = "http_response_response_string_match == 0";
@ -192,10 +198,10 @@ lib.mapAttrsToList
description = "{{$labels.instance}}: healthcheck {{$labels.job}} fails!";
};
*/
cert_expiry = {
condition = "(probe_ssl_earliest_cert_expiry - time())/(3600*24) < 30";
description = "{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s";
};
#cert_expiry = {
# condition = "(probe_ssl_earliest_cert_expiry - time())/(3600*24) < 30";
# description = "{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}} days";
#};
# ignore devices that have S.M.A.R.T. disabled (for example, if attached via USB)

View file

@ -17,6 +17,8 @@
output discard
'';
# Caddyfile snippet for this virtual host:
#   - `bind` restricts the listener to the two listed addresses
#     (NOTE(review): these look like internal/VPN addresses -- confirm),
#   - `tls internal` makes Caddy use a certificate from its own internal CA
#     instead of a publicly trusted one,
#   - `reverse_proxy` forwards requests to the local Alertmanager port taken
#     from config.services.prometheus.alertmanager.port.
extraConfig = ''
bind 10.7.6.2 fd00:fae:fae:fae:fae:2::
tls internal
reverse_proxy :${toString config.services.prometheus.alertmanager.port}
'';
};
@ -101,7 +103,7 @@
alertmanager = {
enable = true;
# port = 9093; # Default
webExternalUrl = "https://alerts.pub.solar"; # TODO use a proper url?
webExternalUrl = "https://alerts.pub.solar";
# environmentFile = "${config.age.secrets.nachtigall-alertmanager-envfile.path}";
configuration = {
@ -120,11 +122,11 @@
email_configs = [
{
send_resolved = true;
to = "TODO";
to = "admins@pub.solar";
from = "alerts@pub.solar";
smarthost = "TODO";
auth_username = "TODO";
auth_password_file = "${config.age.secrets.nachtigall-alertmanager-smtp-password.path}";
smarthost = "mail.greenbaum.zone:465";
auth_username = "admins@pub.solar";
auth_password_file = "${config.age.secrets.grafana-smtp-password.path}";
require_tls = true;
}
];

View file

@ -21,8 +21,8 @@ resource "namecheap_domain_records" "pub-solar" {
}
record {
hostname = "alerts"
type = "CNAME"
address = "flora-6.pub.solar."
type = "A"
address = "10.7.6.2"
}
record {
hostname = "git"