module/prometheus: optionally support reloading on config changes

The new option `services.prometheus.enableReload` has been introduced
which, when enabled, causes the prometheus systemd service to reload
when its config file changes.

More specifically the following property holds: switching to a
configuration (`switch-to-configuration`) that changes the prometheus
configuration only finishes successfully when prometheus has finished
loading the new configuration.

`enableReload` is `false` by default in which case the old semantics
of restarting the prometheus systemd service are in effect.
This commit is contained in:
Bas van Dijk 2021-10-27 14:18:00 +02:00 committed by Bas van Dijk
parent 8d5213123a
commit f12e976ade
4 changed files with 204 additions and 3 deletions

View file

@ -1715,6 +1715,14 @@ Superuser created successfully.
better user experience and benefit from this change.
</para>
</listitem>
<listitem>
<para>
A new option
<literal>services.prometheus.enableReload</literal> has been
added which can be enabled to reload the prometheus service
when its config file changes instead of restarting.
</para>
</listitem>
<listitem>
<para>
Dokuwiki now supports caddy! However

View file

@ -487,6 +487,8 @@ In addition to numerous new and upgraded packages, this release has the followin
- The `cawbird` Twitter client now uses its own API keys to count as a different application than upstream builds. This is done to evade application-level rate limiting. While existing accounts continue to work, users may want to remove and re-register their account in the client to enjoy a better user experience and benefit from this change.
- A new option `services.prometheus.enableReload` has been added which can be enabled to reload the prometheus service when its config file changes instead of restarting.
- Dokuwiki now supports caddy! However
- the nginx option has been removed, in the new configuration, please use the `dokuwiki.webserver = "nginx"` instead.
- The "${hostname}" option has been deprecated, please use `dokuwiki.sites = [ "${hostname}" ]` instead

View file

@ -7,6 +7,30 @@ let
workingDir = "/var/lib/" + cfg.stateDir;
prometheusYmlOut = "${workingDir}/prometheus-substituted.yaml";
# Script that renders the final prometheus config: it runs envsubst over
# prometheusYml and writes the result to prometheusYmlOut.
# The output file is created and restricted to mode 600 *before* envsubst
# fills it in, so the substituted values are never written to a file with
# wider permissions.
# PATH is set explicitly to the coreutils and envsubst packages.
writeConfig = pkgs.writeShellScriptBin "write-prometheus-config" ''
PATH="${makeBinPath (with pkgs; [ coreutils envsubst ])}"
touch '${prometheusYmlOut}'
chmod 600 '${prometheusYmlOut}'
envsubst -o '${prometheusYmlOut}' -i '${prometheusYml}'
'';
# Script that asks systemd to reload prometheus.service, but only when that
# unit is currently active; when it is not active the script does nothing.
triggerReload = pkgs.writeShellScriptBin "trigger-reload-prometheus" ''
PATH="${makeBinPath (with pkgs; [ systemd ])}"
if systemctl -q is-active prometheus.service; then
systemctl reload prometheus.service
fi
'';
# Script used as ExecReload of prometheus.service. It:
#   1. records the current journal cursor,
#   2. sends SIGHUP to $MAINPID (expected to be provided by systemd in the
#      ExecReload environment),
#   3. follows the journal of prometheus.service starting after the recorded
#      cursor and returns as soon as the first line containing
#      "Completed loading of configuration file" appears.
# The cursor is taken before the signal is sent so that the completion
# message cannot be missed.
# NOTE(review): the script itself has no timeout; if the message is never
# logged it keeps waiting -- presumably bounded by systemd's unit timeout,
# confirm.
reload = pkgs.writeShellScriptBin "reload-prometheus" ''
PATH="${makeBinPath (with pkgs; [ systemd coreutils gnugrep ])}"
cursor=$(journalctl --show-cursor -n0 | grep -oP "cursor: \K.*")
kill -HUP $MAINPID
journalctl -u prometheus.service --after-cursor="$cursor" -f \
| grep -m 1 "Completed loading of configuration file" > /dev/null
'';
# a wrapper that verifies that the configuration is valid
promtoolCheck = what: name: file:
if cfg.checkConfig then
@ -47,7 +71,11 @@ let
cmdlineArgs = cfg.extraFlags ++ [
"--storage.tsdb.path=${workingDir}/data/"
"--config.file=/run/prometheus/prometheus-substituted.yaml"
"--config.file=${
if cfg.enableReload
then prometheusYmlOut
else "/run/prometheus/prometheus-substituted.yaml"
}"
"--web.listen-address=${cfg.listenAddress}:${builtins.toString cfg.port}"
"--alertmanager.notification-queue-capacity=${toString cfg.alertmanagerNotificationQueueCapacity}"
"--alertmanager.timeout=${toString cfg.alertmanagerTimeout}s"
@ -731,6 +759,25 @@ in {
'';
};
enableReload = mkOption {
default = false;
type = types.bool;
description = ''
Reload prometheus when configuration file changes (instead of restart).
The following property holds: switching to a configuration
(<literal>switch-to-configuration</literal>) that changes the prometheus
configuration only finishes successfully when prometheus has finished
loading the new configuration.
Note that prometheus will also get reloaded when the location of the
<option>environmentFile</option> changes but not when its contents
change. So when you change its contents, make sure to reload prometheus
manually or include the hash of <option>environmentFile</option> in its
name.
'';
};
environmentFile = mkOption {
type = types.nullOr types.path;
default = null;
@ -928,7 +975,7 @@ in {
systemd.services.prometheus = {
wantedBy = [ "multi-user.target" ];
after = [ "network.target" ];
preStart = ''
preStart = mkIf (!cfg.enableReload) ''
${lib.getBin pkgs.envsubst}/bin/envsubst -o "/run/prometheus/prometheus-substituted.yaml" \
-i "${prometheusYml}"
'';
@ -936,9 +983,10 @@ in {
ExecStart = "${cfg.package}/bin/prometheus" +
optionalString (length cmdlineArgs != 0) (" \\\n " +
concatStringsSep " \\\n " cmdlineArgs);
ExecReload = mkIf cfg.enableReload "+${reload}/bin/reload-prometheus";
User = "prometheus";
Restart = "always";
EnvironmentFile = mkIf (cfg.environmentFile != null) [ cfg.environmentFile ];
EnvironmentFile = mkIf (cfg.environmentFile != null && !cfg.enableReload) [ cfg.environmentFile ];
RuntimeDirectory = "prometheus";
RuntimeDirectoryMode = "0700";
WorkingDirectory = workingDir;
@ -946,5 +994,48 @@ in {
StateDirectoryMode = "0700";
};
};
# One-shot unit that runs the write-prometheus-config script. It is only
# defined when cfg.enableReload is set.
systemd.services.prometheus-config-write = mkIf cfg.enableReload {
# Pulled in by prometheus.service and ordered before it.
wantedBy = [ "prometheus.service" ];
before = [ "prometheus.service" ];
serviceConfig = {
Type = "oneshot";
User = "prometheus";
StateDirectory = cfg.stateDir;
StateDirectoryMode = "0700";
# When an environmentFile is configured it is loaded for this unit, so its
# variables are in the environment of the envsubst call made by the script.
EnvironmentFile = mkIf (cfg.environmentFile != null) [ cfg.environmentFile ];
ExecStart = "${writeConfig}/bin/write-prometheus-config";
};
};
# prometheus-config-reload will activate after prometheus. However, what we
# don't want is that on startup it immediately reloads prometheus because
# prometheus itself might have just started.
#
# Instead we only want to reload prometheus when the config file has
# changed. So on startup prometheus-config-reload will just output a
# harmless message and then stay active (RemainAfterExit).
#
# Then, when the config file has changed, switch-to-configuration notices
# that this service has changed and needs to be reloaded
# (reloadIfChanged). The reload command then actually writes the new config
# and reloads prometheus.
systemd.services.prometheus-config-reload = mkIf cfg.enableReload {
wantedBy = [ "prometheus.service" ];
after = [ "prometheus.service" ];
# Reload (rather than restart) this unit when its definition changes.
reloadIfChanged = true;
serviceConfig = {
Type = "oneshot";
User = "prometheus";
StateDirectory = cfg.stateDir;
StateDirectoryMode = "0700";
# Loaded so the config-writing step of ExecReload sees the variables it
# substitutes.
EnvironmentFile = mkIf (cfg.environmentFile != null) [ cfg.environmentFile ];
# Keep the unit active after ExecStart has exited so it can be reloaded later.
RemainAfterExit = true;
TimeoutSec = 60;
# Starting the unit only logs a message; it does not touch prometheus.
ExecStart = "${pkgs.logger}/bin/logger 'prometheus-config-reload will only reload prometheus when reloaded itself.'";
# On reload: first rewrite the substituted config, then trigger the reload
# of prometheus.service. The "+" prefix is systemd's marker for running a
# command with full privileges, i.e. not restricted by User= above.
ExecReload = [
"${writeConfig}/bin/write-prometheus-config"
"+${triggerReload}/bin/trigger-reload-prometheus"
];
};
};
};
}

View file

@ -41,6 +41,7 @@ in import ./make-test-python.nix {
networking.firewall.allowedTCPPorts = [ grpcPort ];
services.prometheus = {
enable = true;
enableReload = true;
scrapeConfigs = [
{
job_name = "prometheus";
@ -118,6 +119,36 @@ in import ./make-test-python.nix {
# };
#};
};
# Adds a "specialisation" of the above config which allows us to
# "switch" to it and see if the services.prometheus.enableReload
# functionality actually reloads the prometheus service instead of
# restarting it.
specialisation = {
"prometheus-config-change" = {
configuration = {
environment.systemPackages = [ pkgs.yq ];
# This configuration just adds a new prometheus job
# to scrape the node_exporter metrics of the s3 machine.
# We also use an environmentFile to test if that works correctly.
services.prometheus = {
environmentFile = pkgs.writeText "prometheus-config-env-file" ''
JOB_NAME=s3-node_exporter
'';
scrapeConfigs = [
{
job_name = "$JOB_NAME";
static_configs = [
{
targets = [ "s3:9100" ];
}
];
}
];
};
};
};
};
};
query = { pkgs, ... }: {
@ -171,10 +202,17 @@ in import ./make-test-python.nix {
};
environment.systemPackages = [ pkgs.minio-client ];
services.prometheus.exporters.node = {
enable = true;
openFirewall = true;
};
};
};
testScript = { nodes, ... } : ''
import json
# Before starting the other machines we first make sure that our S3 service is online
# and has a bucket added for thanos:
s3.start()
@ -193,6 +231,12 @@ in import ./make-test-python.nix {
# Check if prometheus responds to requests:
prometheus.wait_for_unit("prometheus.service")
# Check if prometheus' config file is correctly locked down because it could contain secrets.
prometheus.succeed(
"stat -c '%a %U' /var/lib/prometheus2/prometheus-substituted.yaml | grep '600 prometheus'"
)
prometheus.wait_for_open_port(${toString queryPort})
prometheus.succeed("curl -sf http://127.0.0.1:${toString queryPort}/metrics")
@ -245,5 +289,61 @@ in import ./make-test-python.nix {
+ "jq .thanos.labels.some_label | "
+ "grep 'required by thanos'"
)
# Check if switching to a NixOS configuration that changes the prometheus
# configuration reloads (instead of restarts) prometheus before the switch
# finishes successfully:
with subtest("config change reloads prometheus"):
# We check if prometheus has finished reloading by looking for the message
# "Completed loading of configuration file" in the journal between the start
# and finish of switching to the new NixOS configuration.
#
# To mark the start we record the journal cursor before starting the switch:
cursor_before_switching = json.loads(
prometheus.succeed("journalctl -n1 -o json --output-fields=__CURSOR")
)["__CURSOR"]
# Now we switch:
prometheus_config_change = prometheus.succeed(
"readlink /run/current-system/specialisation/prometheus-config-change"
).strip()
prometheus.succeed(prometheus_config_change + "/bin/switch-to-configuration test")
# Next we retrieve all logs since the start of switching:
logs_after_starting_switching = prometheus.succeed(
"""
journalctl --after-cursor='{cursor_before_switching}' -o json --output-fields=MESSAGE
""".format(
cursor_before_switching=cursor_before_switching
)
)
# Finally we check if the message "Completed loading of configuration file"
# occurs before the "finished switching to system configuration" message:
finished_switching_msg = (
"finished switching to system configuration " + prometheus_config_change
)
reloaded_before_switching_finished = False
finished_switching = False
for log_line in logs_after_starting_switching.split("\n"):
msg = json.loads(log_line)["MESSAGE"]
if "Completed loading of configuration file" in msg:
reloaded_before_switching_finished = True
if msg == finished_switching_msg:
finished_switching = True
break
assert reloaded_before_switching_finished
assert finished_switching
# Check if the reloaded config includes the new s3-node_exporter job:
prometheus.succeed(
"""
curl -sf http://127.0.0.1:${toString queryPort}/api/v1/status/config \
| jq -r .data.yaml \
| yq '.scrape_configs | any(.job_name == "s3-node_exporter")' \
| grep true
"""
)
'';
}