forked from pub-solar/infra
style: treefmt with nixfmt-rfc-style
This commit is contained in:
parent
11f5557a7a
commit
a8a8155114
|
@ -52,11 +52,9 @@ lib.mapAttrsToList
|
||||||
# };
|
# };
|
||||||
|
|
||||||
filesystem_full_80percent = {
|
filesystem_full_80percent = {
|
||||||
condition = ''
|
condition = ''100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}) > 80'';
|
||||||
100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}) > 80'';
|
|
||||||
time = "10m";
|
time = "10m";
|
||||||
description =
|
description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem.";
|
||||||
"{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem.";
|
|
||||||
};
|
};
|
||||||
|
|
||||||
# filesystem_inodes_full = {
|
# filesystem_inodes_full = {
|
||||||
|
@ -105,17 +103,14 @@ lib.mapAttrsToList
|
||||||
# };
|
# };
|
||||||
|
|
||||||
swap_using_20percent = {
|
swap_using_20percent = {
|
||||||
condition =
|
condition = "node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2";
|
||||||
"node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2";
|
|
||||||
time = "30m";
|
time = "30m";
|
||||||
description =
|
description = "{{$labels.instance}} is using 20% of its swap space for at least 30 minutes.";
|
||||||
"{{$labels.instance}} is using 20% of its swap space for at least 30 minutes.";
|
|
||||||
};
|
};
|
||||||
|
|
||||||
systemd_service_failed = {
|
systemd_service_failed = {
|
||||||
condition = ''node_systemd_unit_state{state="failed"} == 1'';
|
condition = ''node_systemd_unit_state{state="failed"} == 1'';
|
||||||
description =
|
description = "{{$labels.instance}} failed to (re)start service {{$labels.name}}.";
|
||||||
"{{$labels.instance}} failed to (re)start service {{$labels.name}}.";
|
|
||||||
};
|
};
|
||||||
|
|
||||||
restic_backup_too_old = {
|
restic_backup_too_old = {
|
||||||
|
@ -134,19 +129,15 @@ lib.mapAttrsToList
|
||||||
# };
|
# };
|
||||||
|
|
||||||
ram_using_90percent = {
|
ram_using_90percent = {
|
||||||
condition =
|
condition = "node_memory_Buffers_bytes + node_memory_MemFree_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1";
|
||||||
"node_memory_Buffers_bytes + node_memory_MemFree_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1";
|
|
||||||
time = "1h";
|
time = "1h";
|
||||||
description =
|
description = "{{$labels.instance}} is using at least 90% of its RAM for at least 1 hour.";
|
||||||
"{{$labels.instance}} is using at least 90% of its RAM for at least 1 hour.";
|
|
||||||
};
|
};
|
||||||
|
|
||||||
cpu_using_90percent = {
|
cpu_using_90percent = {
|
||||||
condition = ''
|
condition = ''100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) >= 90'';
|
||||||
100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) >= 90'';
|
|
||||||
time = "10m";
|
time = "10m";
|
||||||
description =
|
description = "{{$labels.instance}} is running with cpu usage > 90% for at least 10 minutes: {{$value}}";
|
||||||
"{{$labels.instance}} is running with cpu usage > 90% for at least 10 minutes: {{$value}}";
|
|
||||||
};
|
};
|
||||||
|
|
||||||
reboot = {
|
reboot = {
|
||||||
|
@ -156,18 +147,16 @@ lib.mapAttrsToList
|
||||||
|
|
||||||
uptime = {
|
uptime = {
|
||||||
condition = "(time() - node_boot_time_seconds ) / (60*60*24) > 30";
|
condition = "(time() - node_boot_time_seconds ) / (60*60*24) > 30";
|
||||||
description =
|
description = "Uptime monster: {{$labels.instance}} has been up for more than 30 days.";
|
||||||
"Uptime monster: {{$labels.instance}} has been up for more than 30 days.";
|
|
||||||
};
|
};
|
||||||
|
|
||||||
flake_nixpkgs_outdated = {
|
flake_nixpkgs_outdated = {
|
||||||
condition = ''
|
condition = ''(time() - flake_input_last_modified{input="nixpkgs"}) / (60*60*24) > 30'';
|
||||||
(time() - flake_input_last_modified{input="nixpkgs"}) / (60*60*24) > 30'';
|
description = "Nixpkgs outdated: Nixpkgs on {{$labels.instance}} has not been updated in 30 days";
|
||||||
description =
|
|
||||||
"Nixpkgs outdated: Nixpkgs on {{$labels.instance}} has not been updated in 30 days";
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/* ping = {
|
/*
|
||||||
|
ping = {
|
||||||
condition = "ping_result_code{type!='mobile'} != 0";
|
condition = "ping_result_code{type!='mobile'} != 0";
|
||||||
description = "{{$labels.url}}: ping from {{$labels.instance}} has failed!";
|
description = "{{$labels.url}}: ping from {{$labels.instance}} has failed!";
|
||||||
};
|
};
|
||||||
|
@ -178,12 +167,11 @@ lib.mapAttrsToList
|
||||||
};
|
};
|
||||||
*/
|
*/
|
||||||
http_status = {
|
http_status = {
|
||||||
condition = ''
|
condition = ''probe_http_status_code{instance!~"https://megaclan3000.de"} != 200'';
|
||||||
probe_http_status_code{instance!~"https://megaclan3000.de"} != 200'';
|
description = "http request failed from {{$labels.instance}}: {{$labels.result}}!";
|
||||||
description =
|
|
||||||
"http request failed from {{$labels.instance}}: {{$labels.result}}!";
|
|
||||||
};
|
};
|
||||||
/* http_match_failed = {
|
/*
|
||||||
|
http_match_failed = {
|
||||||
condition = "http_response_response_string_match == 0";
|
condition = "http_response_response_string_match == 0";
|
||||||
description = "{{$labels.server}} : http body not as expected; status code: {{$labels.status_code}}!";
|
description = "{{$labels.server}} : http body not as expected; status code: {{$labels.status_code}}!";
|
||||||
};
|
};
|
||||||
|
@ -206,8 +194,7 @@ lib.mapAttrsToList
|
||||||
*/
|
*/
|
||||||
cert_expiry = {
|
cert_expiry = {
|
||||||
condition = "(probe_ssl_earliest_cert_expiry - time())/(3600*24) < 30";
|
condition = "(probe_ssl_earliest_cert_expiry - time())/(3600*24) < 30";
|
||||||
description =
|
description = "{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s";
|
||||||
"{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s";
|
|
||||||
};
|
};
|
||||||
|
|
||||||
# ignore devices that disabled S.M.A.R.T (example if attached via USB)
|
# ignore devices that disabled S.M.A.R.T (example if attached via USB)
|
||||||
|
@ -223,7 +210,8 @@ lib.mapAttrsToList
|
||||||
description = "{{$labels.instance}}: OOM kill detected";
|
description = "{{$labels.instance}}: OOM kill detected";
|
||||||
};
|
};
|
||||||
|
|
||||||
/* unusual_disk_read_latency = {
|
/*
|
||||||
|
unusual_disk_read_latency = {
|
||||||
condition =
|
condition =
|
||||||
"rate(diskio_read_time[1m]) / rate(diskio_reads[1m]) > 0.1 and rate(diskio_reads[1m]) > 0";
|
"rate(diskio_read_time[1m]) / rate(diskio_reads[1m]) > 0.1 and rate(diskio_reads[1m]) > 0";
|
||||||
description = ''
|
description = ''
|
||||||
|
@ -242,8 +230,7 @@ lib.mapAttrsToList
|
||||||
|
|
||||||
host_memory_under_memory_pressure = {
|
host_memory_under_memory_pressure = {
|
||||||
condition = "rate(node_vmstat_pgmajfault[1m]) > 1000";
|
condition = "rate(node_vmstat_pgmajfault[1m]) > 1000";
|
||||||
description =
|
description = "{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}";
|
||||||
"{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}";
|
|
||||||
};
|
};
|
||||||
|
|
||||||
# ext4_errors = {
|
# ext4_errors = {
|
||||||
|
|
|
@ -84,12 +84,16 @@
|
||||||
];
|
];
|
||||||
|
|
||||||
ruleFiles = [
|
ruleFiles = [
|
||||||
(pkgs.writeText "prometheus-rules.yml" (builtins.toJSON {
|
(pkgs.writeText "prometheus-rules.yml" (
|
||||||
groups = [{
|
builtins.toJSON {
|
||||||
|
groups = [
|
||||||
|
{
|
||||||
name = "alerting-rules";
|
name = "alerting-rules";
|
||||||
rules = import ./alert-rules.nix { inherit lib; };
|
rules = import ./alert-rules.nix { inherit lib; };
|
||||||
}];
|
}
|
||||||
}))
|
];
|
||||||
|
}
|
||||||
|
))
|
||||||
];
|
];
|
||||||
|
|
||||||
alertmanagers = [ { static_configs = [ { targets = [ "localhost:9093" ]; } ]; } ];
|
alertmanagers = [ { static_configs = [ { targets = [ "localhost:9093" ]; } ]; } ];
|
||||||
|
@ -109,10 +113,12 @@
|
||||||
repeat_interval = "24h";
|
repeat_interval = "24h";
|
||||||
};
|
};
|
||||||
|
|
||||||
receivers = [{
|
receivers = [
|
||||||
|
{
|
||||||
name = "all";
|
name = "all";
|
||||||
# Email config documentation: https://prometheus.io/docs/alerting/latest/configuration/#email_config
|
# Email config documentation: https://prometheus.io/docs/alerting/latest/configuration/#email_config
|
||||||
email_configs = [{
|
email_configs = [
|
||||||
|
{
|
||||||
send_resolved = true;
|
send_resolved = true;
|
||||||
to = "TODO";
|
to = "TODO";
|
||||||
from = "alerts@pub.solar";
|
from = "alerts@pub.solar";
|
||||||
|
@ -120,11 +126,13 @@
|
||||||
auth_username = "TODO";
|
auth_username = "TODO";
|
||||||
auth_password_file = "${config.age.secrets.nachtigall-alertmanager-smtp-password.path}";
|
auth_password_file = "${config.age.secrets.nachtigall-alertmanager-smtp-password.path}";
|
||||||
require_tls = true;
|
require_tls = true;
|
||||||
}];
|
}
|
||||||
|
];
|
||||||
# TODO:
|
# TODO:
|
||||||
# For matrix notifications, look into: https://github.com/pinpox/matrix-hook and add a webhook
|
# For matrix notifications, look into: https://github.com/pinpox/matrix-hook and add a webhook
|
||||||
# webhook_configs = [ { url = "http://127.0.0.1:11000/alert"; } ];
|
# webhook_configs = [ { url = "http://127.0.0.1:11000/alert"; } ];
|
||||||
}];
|
}
|
||||||
|
];
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in a new issue