Add alertmanager config - part 2 #189
|
@ -15,6 +15,7 @@
|
||||||
file = "${flake.self}/secrets/grafana-smtp-password.age";
|
file = "${flake.self}/secrets/grafana-smtp-password.age";
|
||||||
mode = "644";
|
mode = "644";
|
||||||
owner = "grafana";
|
owner = "grafana";
|
||||||
|
group = "prometheus";
|
||||||
};
|
};
|
||||||
age.secrets.grafana-keycloak-client-secret = {
|
age.secrets.grafana-keycloak-client-secret = {
|
||||||
file = "${flake.self}/secrets/grafana-keycloak-client-secret.age";
|
file = "${flake.self}/secrets/grafana-keycloak-client-secret.age";
|
||||||
|
|
|
@ -51,12 +51,18 @@ lib.mapAttrsToList
|
||||||
# description = ''{{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} has been lagging by more than 1MB for more than 15m.'';
|
# description = ''{{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} has been lagging by more than 1MB for more than 15m.'';
|
||||||
# };
|
# };
|
||||||
|
|
||||||
filesystem_full_80percent = {
|
filesystem_root_full_80percent = {
|
||||||
condition = ''100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}) > 80'';
|
condition = ''100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}) > 80'';
|
||||||
time = "10m";
|
time = "10m";
|
||||||
description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem.";
|
description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem.";
|
||||||
};
|
};
|
||||||
|
|
||||||
|
filesystem_data_full_80percent = {
|
||||||
|
condition = ''100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/var/lib"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/var/lib"}) > 80'';
|
||||||
|
time = "10m";
|
||||||
|
description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem.";
|
||||||
|
};
|
||||||
|
|
||||||
# filesystem_inodes_full = {
|
# filesystem_inodes_full = {
|
||||||
# condition = ''disk_inodes_free / disk_inodes_total < 0.10'';
|
# condition = ''disk_inodes_free / disk_inodes_total < 0.10'';
|
||||||
# time = "10m";
|
# time = "10m";
|
||||||
|
@ -102,11 +108,11 @@ lib.mapAttrsToList
|
||||||
# "homeassistant notification {{$labels.entity}} ({{$labels.friendly_name}}): {{$value}}";
|
# "homeassistant notification {{$labels.entity}} ({{$labels.friendly_name}}): {{$value}}";
|
||||||
# };
|
# };
|
||||||
|
|
||||||
swap_using_20percent = {
|
#swap_using_20percent = {
|
||||||
condition = "node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2";
|
# condition = "node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2";
|
||||||
time = "30m";
|
# time = "30m";
|
||||||
description = "{{$labels.instance}} is using 20% of its swap space for at least 30 minutes.";
|
# description = "{{$labels.instance}} is using 20% of its swap space for at least 30 minutes.";
|
||||||
};
|
#};
|
||||||
|
|
||||||
systemd_service_failed = {
|
systemd_service_failed = {
|
||||||
condition = ''node_systemd_unit_state{state="failed"} == 1'';
|
condition = ''node_systemd_unit_state{state="failed"} == 1'';
|
||||||
|
@ -118,10 +124,10 @@ lib.mapAttrsToList
|
||||||
description = "{{$labels.instance}} not backed up for more than 24 hours. ({{$value}})";
|
description = "{{$labels.instance}} not backed up for more than 24 hours. ({{$value}})";
|
||||||
};
|
};
|
||||||
|
|
||||||
host_down = {
|
#host_down = {
|
||||||
condition = ''up{job="node-stats", instance!~"ahorn.wireguard:9100|kartoffel.wireguard:9100|mega.wireguard:9100"} == 0'';
|
# condition = ''up{job="node-stats", instance!~"ahorn.wireguard:9100|kartoffel.wireguard:9100|mega.wireguard:9100"} == 0'';
|
||||||
description = "{{$labels.instance}} is down!";
|
# description = "{{$labels.instance}} is down!";
|
||||||
};
|
#};
|
||||||
|
|
||||||
# service_not_running = {
|
# service_not_running = {
|
||||||
# condition = ''systemd_units_active_code{name=~"teamspeak3-server.service|tt-rss.service", sub!="running"}'';
|
# condition = ''systemd_units_active_code{name=~"teamspeak3-server.service|tt-rss.service", sub!="running"}'';
|
||||||
|
@ -166,10 +172,10 @@ lib.mapAttrsToList
|
||||||
description = "{{$labels.instance}}: ping probe from {{$labels.source}} is encountering high latency!";
|
description = "{{$labels.instance}}: ping probe from {{$labels.source}} is encountering high latency!";
|
||||||
};
|
};
|
||||||
*/
|
*/
|
||||||
http_status = {
|
#http_status = {
|
||||||
condition = ''probe_http_status_code{instance!~"https://megaclan3000.de"} != 200'';
|
# condition = ''probe_http_status_code{instance!~"https://pub.solar"} != 200'';
|
||||||
description = "http request failed from {{$labels.instance}}: {{$labels.result}}!";
|
# description = "http request failed from {{$labels.instance}}: {{$labels.result}}!";
|
||||||
};
|
#};
|
||||||
/*
|
/*
|
||||||
http_match_failed = {
|
http_match_failed = {
|
||||||
condition = "http_response_response_string_match == 0";
|
condition = "http_response_response_string_match == 0";
|
||||||
|
@ -192,10 +198,10 @@ lib.mapAttrsToList
|
||||||
description = "{{$labels.instance}}: healtcheck {{$labels.job}} fails!";
|
description = "{{$labels.instance}}: healtcheck {{$labels.job}} fails!";
|
||||||
};
|
};
|
||||||
*/
|
*/
|
||||||
cert_expiry = {
|
#cert_expiry = {
|
||||||
condition = "(probe_ssl_earliest_cert_expiry - time())/(3600*24) < 30";
|
# condition = "(probe_ssl_earliest_cert_expiry - time())/(3600*24) < 30";
|
||||||
description = "{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s";
|
# description = "{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s";
|
||||||
};
|
#};
|
||||||
|
|
||||||
# ignore devices that have S.M.A.R.T. disabled (for example, when attached via USB)
|
# ignore devices that have S.M.A.R.T. disabled (for example, when attached via USB)
|
||||||
|
|
||||||
|
|
|
@ -17,6 +17,8 @@
|
||||||
output discard
|
output discard
|
||||||
'';
|
'';
|
||||||
extraConfig = ''
|
extraConfig = ''
|
||||||
|
bind 10.7.6.2 fd00:fae:fae:fae:fae:2::
|
||||||
|
tls internal
|
||||||
reverse_proxy :${toString config.services.prometheus.alertmanager.port}
|
reverse_proxy :${toString config.services.prometheus.alertmanager.port}
|
||||||
'';
|
'';
|
||||||
};
|
};
|
||||||
|
@ -101,7 +103,7 @@
|
||||||
alertmanager = {
|
alertmanager = {
|
||||||
enable = true;
|
enable = true;
|
||||||
# port = 9093; # Default
|
# port = 9093; # Default
|
||||||
webExternalUrl = "https://alerts.pub.solar"; # TODO use a proper url?
|
webExternalUrl = "https://alerts.pub.solar";
|
||||||
# environmentFile = "${config.age.secrets.nachtigall-alertmanager-envfile.path}";
|
# environmentFile = "${config.age.secrets.nachtigall-alertmanager-envfile.path}";
|
||||||
configuration = {
|
configuration = {
|
||||||
|
|
||||||
|
@ -120,11 +122,11 @@
|
||||||
email_configs = [
|
email_configs = [
|
||||||
{
|
{
|
||||||
send_resolved = true;
|
send_resolved = true;
|
||||||
to = "TODO";
|
to = "admins@pub.solar";
|
||||||
from = "alerts@pub.solar";
|
from = "alerts@pub.solar";
|
||||||
smarthost = "TODO";
|
smarthost = "mail.greenbaum.zone:465";
|
||||||
auth_username = "TODO";
|
auth_username = "admins@pub.solar";
|
||||||
auth_password_file = "${config.age.secrets.nachtigall-alertmanager-smtp-password.path}";
|
auth_password_file = "${config.age.secrets.grafana-smtp-password.path}";
|
||||||
require_tls = true;
|
require_tls = true;
|
||||||
}
|
}
|
||||||
];
|
];
|
||||||
|
|
|
@ -21,8 +21,8 @@ resource "namecheap_domain_records" "pub-solar" {
|
||||||
}
|
}
|
||||||
record {
|
record {
|
||||||
hostname = "alerts"
|
hostname = "alerts"
|
||||||
type = "CNAME"
|
type = "A"
|
||||||
address = "flora-6.pub.solar."
|
address = "10.7.6.2"
|
||||||
}
|
}
|
||||||
record {
|
record {
|
||||||
hostname = "git"
|
hostname = "git"
|
||||||
|
|
Loading…
Reference in a new issue