forked from pub-solar/infra
style: treefmt with nixfmt-rfc-style
This commit is contained in:
parent
11f5557a7a
commit
a8a8155114
|
@ -52,11 +52,9 @@ lib.mapAttrsToList
|
|||
# };
|
||||
|
||||
filesystem_full_80percent = {
|
||||
condition = ''
|
||||
100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}) > 80'';
|
||||
condition = ''100 - ((node_filesystem_avail_bytes{fstype!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{fstype!="rootfs",mountpoint="/"}) > 80'';
|
||||
time = "10m";
|
||||
description =
|
||||
"{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem.";
|
||||
description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem.";
|
||||
};
|
||||
|
||||
# filesystem_inodes_full = {
|
||||
|
@ -105,17 +103,14 @@ lib.mapAttrsToList
|
|||
# };
|
||||
|
||||
swap_using_20percent = {
|
||||
condition =
|
||||
"node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2";
|
||||
condition = "node_memory_SwapTotal_bytes - (node_memory_SwapCached_bytes + node_memory_SwapFree_bytes) > node_memory_SwapTotal_bytes * 0.2";
|
||||
time = "30m";
|
||||
description =
|
||||
"{{$labels.instance}} is using 20% of its swap space for at least 30 minutes.";
|
||||
description = "{{$labels.instance}} is using 20% of its swap space for at least 30 minutes.";
|
||||
};
|
||||
|
||||
systemd_service_failed = {
|
||||
condition = ''node_systemd_unit_state{state="failed"} == 1'';
|
||||
description =
|
||||
"{{$labels.instance}} failed to (re)start service {{$labels.name}}.";
|
||||
description = "{{$labels.instance}} failed to (re)start service {{$labels.name}}.";
|
||||
};
|
||||
|
||||
restic_backup_too_old = {
|
||||
|
@ -134,19 +129,15 @@ lib.mapAttrsToList
|
|||
# };
|
||||
|
||||
ram_using_90percent = {
|
||||
condition =
|
||||
"node_memory_Buffers_bytes + node_memory_MemFree_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1";
|
||||
condition = "node_memory_Buffers_bytes + node_memory_MemFree_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1";
|
||||
time = "1h";
|
||||
description =
|
||||
"{{$labels.instance}} is using at least 90% of its RAM for at least 1 hour.";
|
||||
description = "{{$labels.instance}} is using at least 90% of its RAM for at least 1 hour.";
|
||||
};
|
||||
|
||||
cpu_using_90percent = {
|
||||
condition = ''
|
||||
100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) >= 90'';
|
||||
condition = ''100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) >= 90'';
|
||||
time = "10m";
|
||||
description =
|
||||
"{{$labels.instance}} is running with cpu usage > 90% for at least 10 minutes: {{$value}}";
|
||||
description = "{{$labels.instance}} is running with cpu usage > 90% for at least 10 minutes: {{$value}}";
|
||||
};
|
||||
|
||||
reboot = {
|
||||
|
@ -156,18 +147,16 @@ lib.mapAttrsToList
|
|||
|
||||
uptime = {
|
||||
condition = "(time() - node_boot_time_seconds ) / (60*60*24) > 30";
|
||||
description =
|
||||
"Uptime monster: {{$labels.instance}} has been up for more than 30 days.";
|
||||
description = "Uptime monster: {{$labels.instance}} has been up for more than 30 days.";
|
||||
};
|
||||
|
||||
flake_nixpkgs_outdated = {
|
||||
condition = ''
|
||||
(time() - flake_input_last_modified{input="nixpkgs"}) / (60*60*24) > 30'';
|
||||
description =
|
||||
"Nixpkgs outdated: Nixpkgs on {{$labels.instance}} has not been updated in 30 days";
|
||||
condition = ''(time() - flake_input_last_modified{input="nixpkgs"}) / (60*60*24) > 30'';
|
||||
description = "Nixpkgs outdated: Nixpkgs on {{$labels.instance}} has not been updated in 30 days";
|
||||
};
|
||||
|
||||
/* ping = {
|
||||
/*
|
||||
ping = {
|
||||
condition = "ping_result_code{type!='mobile'} != 0";
|
||||
description = "{{$labels.url}}: ping from {{$labels.instance}} has failed!";
|
||||
};
|
||||
|
@ -178,12 +167,11 @@ lib.mapAttrsToList
|
|||
};
|
||||
*/
|
||||
http_status = {
|
||||
condition = ''
|
||||
probe_http_status_code{instance!~"https://megaclan3000.de"} != 200'';
|
||||
description =
|
||||
"http request failed from {{$labels.instance}}: {{$labels.result}}!";
|
||||
condition = ''probe_http_status_code{instance!~"https://megaclan3000.de"} != 200'';
|
||||
description = "http request failed from {{$labels.instance}}: {{$labels.result}}!";
|
||||
};
|
||||
/* http_match_failed = {
|
||||
/*
|
||||
http_match_failed = {
|
||||
condition = "http_response_response_string_match == 0";
|
||||
description = "{{$labels.server}} : http body not as expected; status code: {{$labels.status_code}}!";
|
||||
};
|
||||
|
@ -206,8 +194,7 @@ lib.mapAttrsToList
|
|||
*/
|
||||
cert_expiry = {
|
||||
condition = "(probe_ssl_earliest_cert_expiry - time())/(3600*24) < 30";
|
||||
description =
|
||||
"{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s";
|
||||
description = "{{$labels.instance}}: The TLS certificate will expire in less than 30 days: {{$value}}s";
|
||||
};
|
||||
|
||||
# ignore devices that disabled S.M.A.R.T (example if attached via USB)
|
||||
|
@ -223,7 +210,8 @@ lib.mapAttrsToList
|
|||
description = "{{$labels.instance}}: OOM kill detected";
|
||||
};
|
||||
|
||||
/* unusual_disk_read_latency = {
|
||||
/*
|
||||
unusual_disk_read_latency = {
|
||||
condition =
|
||||
"rate(diskio_read_time[1m]) / rate(diskio_reads[1m]) > 0.1 and rate(diskio_reads[1m]) > 0";
|
||||
description = ''
|
||||
|
@ -242,8 +230,7 @@ lib.mapAttrsToList
|
|||
|
||||
host_memory_under_memory_pressure = {
|
||||
condition = "rate(node_vmstat_pgmajfault[1m]) > 1000";
|
||||
description =
|
||||
"{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}";
|
||||
description = "{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}";
|
||||
};
|
||||
|
||||
# ext4_errors = {
|
||||
|
|
|
@ -84,15 +84,19 @@
|
|||
];
|
||||
|
||||
ruleFiles = [
|
||||
(pkgs.writeText "prometheus-rules.yml" (builtins.toJSON {
|
||||
groups = [{
|
||||
(pkgs.writeText "prometheus-rules.yml" (
|
||||
builtins.toJSON {
|
||||
groups = [
|
||||
{
|
||||
name = "alerting-rules";
|
||||
rules = import ./alert-rules.nix { inherit lib; };
|
||||
}];
|
||||
}))
|
||||
}
|
||||
];
|
||||
}
|
||||
))
|
||||
];
|
||||
|
||||
alertmanagers = [{ static_configs = [{ targets = [ "localhost:9093" ]; }]; }];
|
||||
alertmanagers = [ { static_configs = [ { targets = [ "localhost:9093" ]; } ]; } ];
|
||||
|
||||
alertmanager = {
|
||||
enable = true;
|
||||
|
@ -109,10 +113,12 @@
|
|||
repeat_interval = "24h";
|
||||
};
|
||||
|
||||
receivers = [{
|
||||
receivers = [
|
||||
{
|
||||
name = "all";
|
||||
# Email config documentation: https://prometheus.io/docs/alerting/latest/configuration/#email_config
|
||||
email_configs = [{
|
||||
email_configs = [
|
||||
{
|
||||
send_resolved = true;
|
||||
to = "TODO";
|
||||
from = "alerts@pub.solar";
|
||||
|
@ -120,11 +126,13 @@
|
|||
auth_username = "TODO";
|
||||
auth_password_file = "${config.age.secrets.nachtigall-alertmanager-smtp-password.path}";
|
||||
require_tls = true;
|
||||
}];
|
||||
}
|
||||
];
|
||||
# TODO:
|
||||
# For matrix notifications, look into: https://github.com/pinpox/matrix-hook and add a webhook
|
||||
# webhook_configs = [ { url = "http://127.0.0.1:11000/alert"; } ];
|
||||
}];
|
||||
}
|
||||
];
|
||||
};
|
||||
};
|
||||
};
|
||||
|
|
Loading…
Reference in a new issue