Add alertmanager config #169

Closed
pinpox wants to merge 4 commits from pinpox/infra:main into main
4 changed files with 391 additions and 73 deletions

View file

@ -0,0 +1,260 @@
{ lib }:
let
# docker's filesystems disappear quickly, leading to false positives
deviceFilter = ''path!~"^(/var/lib/docker|/nix/store).*"'';
in
lib.mapAttrsToList
(name: opts: {
alert = name;
expr = opts.condition;
for = opts.time or "2m";
labels = { };
annotations.description = opts.description;
})
({
# prometheus_too_many_restarts = {
# condition = ''changes(process_start_time_seconds{job=~"prometheus|alertmanager"}[15m]) > 2'';
# description = "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.";
# };
# alert_manager_config_not_synced = {
# condition = ''count(count_values("config_hash", alertmanager_config_hash)) > 1'';
# description = "Configurations of AlertManager cluster instances are out of sync.";
# };
#alert_manager_e2e_dead_man_switch = {
# condition = "vector(1)";
# description = "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.";
#};
# prometheus_not_connected_to_alertmanager = {
# condition = "prometheus_notifications_alertmanagers_discovered < 1";
# description = "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
# };
# prometheus_rule_evaluation_failures = {
# condition = "increase(prometheus_rule_evaluation_failures_total[3m]) > 0";
# description = "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
# };
# prometheus_template_expansion_failures = {
# condition = "increase(prometheus_template_text_expansion_failures_total[3m]) > 0";
# time = "0m";
# description = "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
# };
# promtail_file_lagging = {
# condition = ''abs(promtail_file_bytes_total - promtail_read_bytes_total) > 1e6'';
# time = "15m";
# description = ''{{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} has been lagging by more than 1MB for more than 15m.'';
# };
filesystem_full_80percent = {
condition = ''
100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}) > 80'';
time = "10m";
description =
"{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem.";
};
# filesystem_inodes_full = {
# condition = ''disk_inodes_free / disk_inodes_total < 0.10'';
# time = "10m";
# description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 10% inodes left on its filesystem.";
# };
# daily_task_not_run = {
# # give 6 hours grace period
# condition = ''time() - task_last_run{state="ok",frequency="daily"} > (24 + 6) * 60 * 60'';
# description = "{{$labels.instance}}: {{$labels.name}} was not run in the last 24h";
# };
# daily_task_failed = {
# condition = ''task_last_run{state="fail"}'';
# description = "{{$labels.instance}}: {{$labels.name}} failed to run";
# };
# } // (lib.genAttrs [
# "borgbackup-turingmachine"
# "borgbackup-eve"
# "borgbackup-datastore"
# ]
# (name: {
# condition = ''absent_over_time(task_last_run{name="${name}"}[1d])'';
# description = "status of ${name} is unknown: no data for a day";
# }))
# // {
# borgbackup_matchbox_not_run = {
# # give 6 hours grace period
# condition = ''time() - task_last_run{state="ok",frequency="daily",name="borgbackup-matchbox"} > 7 * 24 * 60 * 60'';
# description = "{{$labels.instance}}: {{$labels.name}} was not run in the last week";
# };
# borgbackup_matchbox = {
# condition = ''absent_over_time(task_last_run{name="borgbackup-matchbox"}[7d])'';
# description = "status of borgbackup-matchbox is unknown: no data for a week";
# };
# homeassistant = {
# condition = ''
# homeassistant_entity_available{domain="persistent_notification", entity!="persistent_notification.http_login"} >= 0'';
# description =
# "homeassistant notification {{$labels.entity}} ({{$labels.friendly_name}}): {{$value}}";
# };
swap_using_20percent = {
condition =
"node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2";
time = "30m";
description =
"{{$labels.instance}} is using 20% of its swap space for at least 30 minutes.";
};
systemd_service_failed = {
condition = ''node_systemd_unit_state{state="failed"} == 1'';
description =
"{{$labels.instance}} failed to (re)start service {{$labels.name}}.";
};
restic_backup_too_old = {
condition = ''(time() - restic_snapshots_latest_time)/(60*60) > 24'';
description = "{{$labels.instance}} not backed up for more than 24 hours. ({{$value}})";
};
host_down = {
condition = ''up{job="node-stats", instance!~"ahorn.wireguard:9100|kartoffel.wireguard:9100|mega.wireguard:9100"} == 0'';
description = "{{$labels.instance}} is down!";
};
# service_not_running = {
# condition = ''systemd_units_active_code{name=~"teamspeak3-server.service|tt-rss.service", sub!="running"}'';
# description = "{{$labels.instance}} should have a running {{$labels.name}}.";
# };
ram_using_90percent = {
condition =
"node_memory_Buffers_bytes + node_memory_MemFree_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1";
time = "1h";
description =
"{{$labels.instance}} is using at least 90% of its RAM for at least 1 hour.";
};
cpu_using_90percent = {
condition = ''
100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) >= 90'';
time = "10m";
description =
"{{$labels.instance}} is running with cpu usage > 90% for at least 10 minutes: {{$value}}";
};
reboot = {
condition = "node_boot_time_seconds < 300";
description = "{{$labels.instance}} just rebooted.";
};
uptime = {
condition = "(time() - node_boot_time_seconds ) / (60*60*24) > 30";
description =
"Uptime monster: {{$labels.instance}} has been up for more than 30 days.";
};
flake_nixpkgs_outdated = {
condition = ''
(time() - flake_input_last_modified{input="nixpkgs"}) / (60*60*24) > 30'';
description =
"Nixpkgs outdated: Nixpkgs on {{$labels.instance}} has not been updated in 30 days";
};
/* ping = {
condition = "ping_result_code{type!='mobile'} != 0";
description = "{{$labels.url}}: ping from {{$labels.instance}} has failed!";
};
ping_high_latency = {
condition = "ping_average_response_ms{type!='mobile'} > 5000";
description = "{{$labels.instance}}: ping probe from {{$labels.source}} is encountering high latency!";
};
*/
http_status = {
condition = ''
probe_http_status_code{instance!~"https://megaclan3000.de"} != 200'';
description =
"http request failed from {{$labels.instance}}: {{$labels.result}}!";
};
/* http_match_failed = {
condition = "http_response_response_string_match == 0";
description = "{{$labels.server}} : http body not as expected; status code: {{$labels.status_code}}!";
};
dns_query = {
condition = "dns_query_result_code != 0";
description = "{{$labels.domain}} : could retrieve A record {{$labels.instance}} from server {{$labels.server}}: {{$labels.result}}!";
};
secure_dns_query = {
condition = "secure_dns_state != 0";
description = "{{$labels.domain}} : could retrieve A record {{$labels.instance}} from server {{$labels.server}}: {{$labels.result}} for protocol {{$labels.protocol}}!";
};
connection_failed = {
condition = "net_response_result_code != 0";
description = "{{$labels.server}}: connection to {{$labels.port}}({{$labels.protocol}}) failed from {{$labels.instance}}";
};
healthchecks = {
condition = "hc_check_up == 0";
description = "{{$labels.instance}}: healtcheck {{$labels.job}} fails!";
};
*/
cert_expiry = {
condition = "(probe_ssl_earliest_cert_expiry - time())/(3600*24) < 30";
description =
"{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s";
};
# ignore devices that disabled S.M.A.R.T (example if attached via USB)
# smart_errors = {
# condition = ''smart_device_health_ok{enabled!="Disabled"} != 1'';
# description =
# "{{$labels.instance}}: S.M.A.R.T reports: {{$labels.device}} ({{$labels.model}}) has errors.";
# };
oom_kills = {
condition = "increase(node_vmstat_oom_kill[5m]) > 0";
description = "{{$labels.instance}}: OOM kill detected";
};
/* unusual_disk_read_latency = {
condition =
"rate(diskio_read_time[1m]) / rate(diskio_reads[1m]) > 0.1 and rate(diskio_reads[1m]) > 0";
description = ''
{{$labels.instance}}: Disk latency is growing (read operations > 100ms)
'';
};
unusual_disk_write_latency = {
condition =
"rate(diskio_write_time[1m]) / rate(diskio_write[1m]) > 0.1 and rate(diskio_write[1m]) > 0";
description = ''
{{$labels.instance}}: Disk latency is growing (write operations > 100ms)
'';
};
*/
host_memory_under_memory_pressure = {
condition = "rate(node_vmstat_pgmajfault[1m]) > 1000";
description =
"{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}";
};
# ext4_errors = {
# condition = "ext4_errors_value > 0";
# description =
# "{{$labels.instance}}: ext4 has reported {{$value}} I/O errors: check /sys/fs/ext4/*/errors_count";
# };
# alerts_silences_changed = {
# condition = ''abs(delta(alertmanager_silences{state="active"}[1h])) >= 1'';
# description =
# "alertmanager: number of active silences has changed: {{$value}}";
# };
})

View file

@ -37,6 +37,14 @@
reverse_proxy :${toString config.services.loki.configuration.server.http_listen_port}
'';
};
"alerts.pub.solar" = {
logFormat = lib.mkForce ''
output discard
'';
extraConfig = ''
Review

Here we'd want to use the bind directive to only listen on the WireGuard IP, exposing alertmanager only internally.

bind 10.7.6.2
tls internal

and then

reverse_proxy :${toString config.services.prometheus.alertmanager.port}

To have valid Let's Encrypt certificates, we'd need to configure DNS challenge, but that could be a future task IMO.

Here we'd want to use the `bind` directive to only listen on the WireGuard IP, exposing alertmanager only internally. ``` bind 10.7.6.2 tls internal ``` and then ``` reverse_proxy :${toString config.services.prometheus.alertmanager.port} ``` To have valid Let's Encrypt certificates, we'd need to configure DNS challenge, but that could be a future task IMO.
Review

Sure feel free to reuse as you wish. I might be able to join tomorrow if you are still hakking on it

Sure feel free to reuse as you wish. I might be able to join tomorrow if you are still hakking on it
reverse_proxy 10.7.6.2:${toString config.services.prometheus.alertmanager.port}
'';
};
"grafana.pub.solar" = {
logFormat = lib.mkForce ''
output discard

View file

@ -65,5 +65,50 @@
}];
}
];
ruleFiles = [
(pkgs.writeText "prometheus-rules.yml" (builtins.toJSON {
groups = [{
name = "alerting-rules";
rules = import ./alert-rules.nix { inherit lib; };
}];
}))
];
alertmanagers = [{ static_configs = [{ targets = [ "localhost:9093" ]; }]; }];
alertmanager = {
enable = true;
# port = 9093; # Default
webExternalUrl = "https://alerts.pub.solar"; # TODO use a proper url?
pinpox marked this conversation as resolved
Review

You're missing the nginx config in front of the alertmanager

You're missing the nginx config in front of the alertmanager
Review

fixed by d58209ef93

fixed by d58209ef93385735e00084fb02015918985dc06d
# environmentFile = "${config.age.secrets.nachtigall-alertmanager-envfile.path}";
configuration = {
route = {
receiver = "all";
group_by = [ "instance" ];
group_wait = "30s";
group_interval = "2m";
repeat_interval = "24h";
};
receivers = [{
name = "all";
# Email config documentation: https://prometheus.io/docs/alerting/latest/configuration/#email_config
email_configs = [{
send_resolved = true;
to = "TODO";
from = "alerts@pub.solar";
smarthost = "TODO";
auth_username = "TODO";
auth_password_file = "${config.age.secrets.nachtigall-alertmanager-smtp-password.path}";
require_tls = true;
}];
# TODO:
# For matrix notifications, look into: https://github.com/pinpox/matrix-hook and add a webhook
# webhook_configs = [ { url = "http://127.0.0.1:11000/alert"; } ];
}];
};
};
};
}

View file

@ -1,181 +1,186 @@
# https://registry.terraform.io/providers/namecheap/namecheap/latest/docs
resource "namecheap_domain_records" "pub-solar" {
domain = "pub.solar"
mode = "OVERWRITE"
domain = "pub.solar"
mode = "OVERWRITE"
email_type = "MX"
record {
hostname = "flora-6"
type = "A"
address = "80.71.153.210"
type = "A"
address = "80.71.153.210"
}
record {
hostname = "auth"
type = "CNAME"
address = "nachtigall.pub.solar."
type = "CNAME"
address = "nachtigall.pub.solar."
}
record {
hostname = "ci"
type = "A"
address = "80.71.153.210"
type = "A"
address = "80.71.153.210"
}
record {
hostname = "alerts"
type = "CNAME"
address = "flora-6.pub.solar."
}
record {
hostname = "git"
type = "CNAME"
address = "nachtigall.pub.solar."
type = "CNAME"
address = "nachtigall.pub.solar."
}
record {
hostname = "stream"
type = "CNAME"
address = "nachtigall.pub.solar."
type = "CNAME"
address = "nachtigall.pub.solar."
}
record {
hostname = "list"
type = "CNAME"
address = "nachtigall.pub.solar."
type = "CNAME"
address = "nachtigall.pub.solar."
}
record {
hostname = "obs-portal"
type = "A"
address = "80.71.153.210"
type = "A"
address = "80.71.153.210"
}
record {
hostname = "vpn"
type = "A"
address = "80.71.153.210"
type = "A"
address = "80.71.153.210"
}
record {
hostname = "cache"
type = "A"
address = "95.217.225.160"
type = "A"
address = "95.217.225.160"
}
record {
hostname = "factorio"
type = "A"
address = "80.244.242.2"
type = "A"
address = "80.244.242.2"
}
record {
hostname = "collabora"
type = "CNAME"
address = "nachtigall.pub.solar."
type = "CNAME"
address = "nachtigall.pub.solar."
}
record {
hostname = "@"
type = "ALIAS"
address = "nachtigall.pub.solar."
ttl = 300
type = "ALIAS"
address = "nachtigall.pub.solar."
ttl = 300
}
record {
hostname = "chat"
type = "CNAME"
address = "nachtigall.pub.solar."
type = "CNAME"
address = "nachtigall.pub.solar."
}
record {
hostname = "cloud"
type = "CNAME"
address = "nachtigall.pub.solar."
type = "CNAME"
address = "nachtigall.pub.solar."
}
record {
hostname = "turn"
type = "CNAME"
address = "nachtigall.pub.solar."
type = "CNAME"
address = "nachtigall.pub.solar."
}
record {
hostname = "grafana"
type = "A"
address = "80.71.153.210"
type = "A"
address = "80.71.153.210"
}
record {
hostname = "hpb"
type = "A"
address = "80.71.153.239"
type = "A"
address = "80.71.153.239"
}
record {
hostname = "files"
type = "CNAME"
address = "nachtigall.pub.solar."
type = "CNAME"
address = "nachtigall.pub.solar."
}
record {
hostname = "search"
type = "CNAME"
address = "nachtigall.pub.solar."
type = "CNAME"
address = "nachtigall.pub.solar."
}
record {
hostname = "wiki"
type = "CNAME"
address = "nachtigall.pub.solar."
type = "CNAME"
address = "nachtigall.pub.solar."
}
record {
hostname = "mastodon"
type = "CNAME"
address = "nachtigall.pub.solar."
type = "CNAME"
address = "nachtigall.pub.solar."
}
record {
hostname = "matrix"
type = "CNAME"
address = "nachtigall.pub.solar."
type = "CNAME"
address = "nachtigall.pub.solar."
}
record {
hostname = "tmate"
type = "CNAME"
address = "nachtigall.pub.solar."
type = "CNAME"
address = "nachtigall.pub.solar."
}
record {
hostname = "www"
type = "CNAME"
address = "nachtigall.pub.solar."
type = "CNAME"
address = "nachtigall.pub.solar."
}
record {
hostname = "@"
type = "TXT"
address = "v=spf1 include:spf.greenbaum.zone a:list.pub.solar ~all"
type = "TXT"
address = "v=spf1 include:spf.greenbaum.zone a:list.pub.solar ~all"
}
record {
hostname = "list"
type = "TXT"
address = "v=spf1 a:list.pub.solar ?all"
type = "TXT"
address = "v=spf1 a:list.pub.solar ?all"
}
record {
hostname = "_dmarc"
type = "TXT"
address = "v=DMARC1; p=reject;"
type = "TXT"
address = "v=DMARC1; p=reject;"
}
record {
hostname = "_dmarc.list"
type = "TXT"
address = "v=DMARC1; p=reject;"
type = "TXT"
address = "v=DMARC1; p=reject;"
}
record {
hostname = "modoboa._domainkey"
type = "TXT"
address = "v=DKIM1;k=rsa;p=MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAx/EqLMpk0MyL1aQ0JVG44ypTRbZBVA13MFjEntxAvowaWtq1smRbnEwTTKgqUOrUyaM4dVmli1dedne4mk/ncqRAm02KuhtTY+5wXfhTKK53EhqehbKwH+Qvzb12983Qwdau/QTHiFHwXHufMaSsCvd9CRWCp9q68Q7noQqndJeLHT6L0eECd2Zk3ZxJuh+Fxdb7+Kw68Tf6z13Rs+MU01qLM7x0jmSQHa4cv2pk+7NTGMBRp6fVskfbqev5nFkZWJ7rhXEbP9Eukd/L3ro/ubs1quWJotG02gPRKE8fgkm1Ytlws1/pnqpuvKXQS1HzBEP1X2ExezJMzQ1SnZCigQIDAQAB"
type = "TXT"
address = "v=DKIM1;k=rsa;p=MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAx/EqLMpk0MyL1aQ0JVG44ypTRbZBVA13MFjEntxAvowaWtq1smRbnEwTTKgqUOrUyaM4dVmli1dedne4mk/ncqRAm02KuhtTY+5wXfhTKK53EhqehbKwH+Qvzb12983Qwdau/QTHiFHwXHufMaSsCvd9CRWCp9q68Q7noQqndJeLHT6L0eECd2Zk3ZxJuh+Fxdb7+Kw68Tf6z13Rs+MU01qLM7x0jmSQHa4cv2pk+7NTGMBRp6fVskfbqev5nFkZWJ7rhXEbP9Eukd/L3ro/ubs1quWJotG02gPRKE8fgkm1Ytlws1/pnqpuvKXQS1HzBEP1X2ExezJMzQ1SnZCigQIDAQAB"
}
record {
hostname = "@"
type = "MX"
address = "mail.greenbaum.zone."
mx_pref = "0"
type = "MX"
address = "mail.greenbaum.zone."
mx_pref = "0"
}
record {
hostname = "list"
type = "MX"
address = "list.pub.solar."
mx_pref = "0"
type = "MX"
address = "list.pub.solar."
mx_pref = "0"
}
record {
hostname = "nachtigall"
type = "A"
address = "138.201.80.102"
type = "A"
address = "138.201.80.102"
}
record {
hostname = "nachtigall"
type = "AAAA"
address = "2a01:4f8:172:1c25::1"
type = "AAAA"
address = "2a01:4f8:172:1c25::1"
}
record {
hostname = "matrix.test"
type = "CNAME"
address = "nachtigall.pub.solar."
type = "CNAME"
address = "nachtigall.pub.solar."
}
# SRV records can only be changed via NameCheap Web UI
# add comment