Add alertmanager config - part 2 #189

Merged
teutat3s merged 9 commits from alertmanager into main 2024-05-15 15:23:59 +00:00
4 changed files with 34 additions and 25 deletions
Showing only changes of commit 9245fa6797 - Show all commits

View file

@ -15,6 +15,7 @@
file = "${flake.self}/secrets/grafana-smtp-password.age"; file = "${flake.self}/secrets/grafana-smtp-password.age";
mode = "644"; mode = "644";
owner = "grafana"; owner = "grafana";
group = "prometheus";
}; };
age.secrets.grafana-keycloak-client-secret = { age.secrets.grafana-keycloak-client-secret = {
file = "${flake.self}/secrets/grafana-keycloak-client-secret.age"; file = "${flake.self}/secrets/grafana-keycloak-client-secret.age";

View file

@ -51,12 +51,18 @@ lib.mapAttrsToList
# description = ''{{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} has been lagging by more than 1MB for more than 15m.''; # description = ''{{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} has been lagging by more than 1MB for more than 15m.'';
# }; # };
filesystem_full_80percent = { filesystem_root_full_80percent = {
condition = ''100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}) > 80''; condition = ''100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}) > 80'';
time = "10m"; time = "10m";
description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem."; description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem.";
}; };
filesystem_data_full_80percent = {
condition = ''100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/var/lib"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/var/lib"}) > 80'';
time = "10m";
description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem.";
};
# filesystem_inodes_full = { # filesystem_inodes_full = {
# condition = ''disk_inodes_free / disk_inodes_total < 0.10''; # condition = ''disk_inodes_free / disk_inodes_total < 0.10'';
# time = "10m"; # time = "10m";
@ -102,11 +108,11 @@ lib.mapAttrsToList
# "homeassistant notification {{$labels.entity}} ({{$labels.friendly_name}}): {{$value}}"; # "homeassistant notification {{$labels.entity}} ({{$labels.friendly_name}}): {{$value}}";
# }; # };
swap_using_20percent = { #swap_using_20percent = {
condition = "node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2"; # condition = "node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2";
time = "30m"; # time = "30m";
description = "{{$labels.instance}} is using 20% of its swap space for at least 30 minutes."; # description = "{{$labels.instance}} is using 20% of its swap space for at least 30 minutes.";
}; #};
systemd_service_failed = { systemd_service_failed = {
condition = ''node_systemd_unit_state{state="failed"} == 1''; condition = ''node_systemd_unit_state{state="failed"} == 1'';
@ -118,10 +124,10 @@ lib.mapAttrsToList
description = "{{$labels.instance}} not backed up for more than 24 hours. ({{$value}})"; description = "{{$labels.instance}} not backed up for more than 24 hours. ({{$value}})";
}; };
host_down = { #host_down = {
condition = ''up{job="node-stats", instance!~"ahorn.wireguard:9100|kartoffel.wireguard:9100|mega.wireguard:9100"} == 0''; # condition = ''up{job="node-stats", instance!~"ahorn.wireguard:9100|kartoffel.wireguard:9100|mega.wireguard:9100"} == 0'';
description = "{{$labels.instance}} is down!"; # description = "{{$labels.instance}} is down!";
}; #};
# service_not_running = { # service_not_running = {
# condition = ''systemd_units_active_code{name=~"teamspeak3-server.service|tt-rss.service", sub!="running"}''; # condition = ''systemd_units_active_code{name=~"teamspeak3-server.service|tt-rss.service", sub!="running"}'';
@ -166,10 +172,10 @@ lib.mapAttrsToList
description = "{{$labels.instance}}: ping probe from {{$labels.source}} is encountering high latency!"; description = "{{$labels.instance}}: ping probe from {{$labels.source}} is encountering high latency!";
}; };
*/ */
http_status = { #http_status = {
condition = ''probe_http_status_code{instance!~"https://megaclan3000.de"} != 200''; # condition = ''probe_http_status_code{instance!~"https://pub.solar"} != 200'';
description = "http request failed from {{$labels.instance}}: {{$labels.result}}!"; # description = "http request failed from {{$labels.instance}}: {{$labels.result}}!";
}; #};
/* /*
http_match_failed = { http_match_failed = {
condition = "http_response_response_string_match == 0"; condition = "http_response_response_string_match == 0";
@ -192,10 +198,10 @@ lib.mapAttrsToList
description = "{{$labels.instance}}: healtcheck {{$labels.job}} fails!"; description = "{{$labels.instance}}: healtcheck {{$labels.job}} fails!";
}; };
*/ */
cert_expiry = { #cert_expiry = {
condition = "(probe_ssl_earliest_cert_expiry - time())/(3600*24) < 30"; # condition = "(probe_ssl_earliest_cert_expiry - time())/(3600*24) < 30";
description = "{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s"; # description = "{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s";
}; #};
# ignore devices that disabled S.M.A.R.T (example if attached via USB) # ignore devices that disabled S.M.A.R.T (example if attached via USB)

View file

@ -17,6 +17,8 @@
output discard output discard
''; '';
extraConfig = '' extraConfig = ''
bind 10.7.6.2 fd00:fae:fae:fae:fae:2::
tls internal
reverse_proxy :${toString config.services.prometheus.alertmanager.port} reverse_proxy :${toString config.services.prometheus.alertmanager.port}
''; '';
}; };
@ -101,7 +103,7 @@
alertmanager = { alertmanager = {
enable = true; enable = true;
# port = 9093; # Default # port = 9093; # Default
webExternalUrl = "https://alerts.pub.solar"; # TODO use a proper url? webExternalUrl = "https://alerts.pub.solar";
# environmentFile = "${config.age.secrets.nachtigall-alertmanager-envfile.path}"; # environmentFile = "${config.age.secrets.nachtigall-alertmanager-envfile.path}";
configuration = { configuration = {
@ -120,11 +122,11 @@
email_configs = [ email_configs = [
{ {
send_resolved = true; send_resolved = true;
to = "TODO"; to = "admins@pub.solar";
from = "alerts@pub.solar"; from = "alerts@pub.solar";
smarthost = "TODO"; smarthost = "mail.greenbaum.zone:465";
auth_username = "TODO"; auth_username = "admins@pub.solar";
auth_password_file = "${config.age.secrets.nachtigall-alertmanager-smtp-password.path}"; auth_password_file = "${config.age.secrets.grafana-smtp-password.path}";
require_tls = true; require_tls = true;
} }
]; ];

View file

@ -21,8 +21,8 @@ resource "namecheap_domain_records" "pub-solar" {
} }
record { record {
hostname = "alerts" hostname = "alerts"
type = "CNAME" type = "A"
address = "flora-6.pub.solar." address = "10.7.6.2"
} }
record { record {
hostname = "git" hostname = "git"