nixos/slurm: add slurmdbd to module

* New options "services.slurm.dbdserver.[enable,config]"
* Add slurmdbd to test slurm.nix
Markus Kowalewski 2018-09-15 13:09:36 +02:00
parent 111d4eb090
commit 79c9dbfb40
2 changed files with 99 additions and 7 deletions
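
For orientation, a minimal sketch of how the new options might be used in a host's configuration; the host name "dbd" is only an illustration, and dbdHost defaults to config.networking.hostName:

    services.slurm.dbdserver = {
      enable = true;
      dbdHost = "dbd";  # machine on which slurmdbd runs
    };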

nixos/modules/services/computing/slurm/slurm.nix

@@ -29,12 +29,19 @@ let
${cfg.extraPlugstackConfig}
'';
cgroupConfig = pkgs.writeTextDir "cgroup.conf"
''
${cfg.extraCgroupConfig}
'';
slurmdbdConf = pkgs.writeTextDir "slurmdbd.conf"
''
DbdHost=${cfg.dbdserver.dbdHost}
SlurmUser=${cfg.user}
StorageType=accounting_storage/mysql
${cfg.dbdserver.extraConfig}
'';
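
With everything left at its defaults, the generated slurmdbd.conf therefore contains roughly the following (angle brackets stand for the substituted option values):

    DbdHost=<networking.hostName>
    SlurmUser=<services.slurm.user>
    StorageType=accounting_storage/mysql
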
# slurm expects some additional config files to be
# in the same directory as slurm.conf
etcSlurm = pkgs.symlinkJoin {
@@ -65,6 +72,27 @@ in
};
};
dbdserver = {
enable = mkEnableOption "SlurmDBD service";
dbdHost = mkOption {
type = types.str;
default = config.networking.hostName;
description = ''
Hostname of the machine where <literal>slurmdbd</literal>
is running (i.e. name returned by <literal>hostname -s</literal>).
'';
};
extraConfig = mkOption {
type = types.lines;
default = "";
description = ''
Extra configuration for <literal>slurmdbd.conf</literal>
'';
};
};
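
Since extraConfig is appended verbatim to the generated slurmdbd.conf, further upstream slurmdbd.conf parameters can be passed through it. A hypothetical example (StorageUser and StorageLoc are standard slurmdbd.conf keys; the values are placeholders):

    services.slurm.dbdserver.extraConfig = ''
      StorageUser=slurm
      StorageLoc=slurm_acct_db
    '';
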
client = {
enable = mkEnableOption "slurm client daemon";
};
@@ -208,6 +236,8 @@ in
used when <literal>procTrackType=proctrack/cgroup</literal>.
'';
};
};
};
@@ -244,7 +274,10 @@ in
'';
};
in mkIf (cfg.enableStools || cfg.client.enable || cfg.server.enable) {
in mkIf ( cfg.enableStools ||
cfg.client.enable ||
cfg.server.enable ||
cfg.dbdserver.enable ) {
environment.systemPackages = [ wrappedSlurm ];
@@ -301,6 +334,24 @@ in
'';
};
systemd.services.slurmdbd = mkIf (cfg.dbdserver.enable) {
path = with pkgs; [ wrappedSlurm munge coreutils ];
wantedBy = [ "multi-user.target" ];
after = [ "network.target" "munged.service" "mysql.service" ];
requires = [ "munged.service" "mysql.service" ];
# slurm strips the last component off the path
environment.SLURM_CONF = "${slurmdbdConf}/slurm.conf";
serviceConfig = {
Type = "forking";
ExecStart = "${cfg.package}/bin/slurmdbd";
PIDFile = "/run/slurmdbd.pid";
ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
};
};
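
The test below simply disables the firewall on all nodes; on a real deployment the dbd host additionally needs its DbdPort reachable from the slurmctld host, for example (6819 is the upstream default port, adjust if it was changed):

    networking.firewall.allowedTCPPorts = [ 6819 ];
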
};
}

nixos/tests/slurm.nix

@@ -1,5 +1,7 @@
import ./make-test.nix ({ ... }:
let mungekey = "mungeverryweakkeybuteasytointegratoinatest";
let
mungekey = "mungeverryweakkeybuteasytointegratoinatest";
slurmconfig = {
controlMachine = "control";
nodeName = ''
@@ -7,6 +9,10 @@ let mungekey = "mungeverryweakkeybuteasytointegratoinatest";
NodeName=node[1-3] CPUs=1 State=UNKNOWN
'';
partitionName = "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP";
extraConfig = ''
AccountingStorageHost=dbd
AccountingStorageType=accounting_storage/slurmdbd
'';
};
in {
name = "slurm";
@@ -16,7 +22,7 @@ in {
computeNode =
{ ...}:
{
# TODO slrumd port and slurmctld port should be configurations and
# TODO slurmd port and slurmctld port should be configurations and
# automatically allowed by the firewall.
networking.firewall.enable = false;
services.slurm = {
@@ -43,6 +49,24 @@ in {
} // slurmconfig;
};
dbd =
{ pkgs, ... } :
{
networking.firewall.enable = false;
services.slurm.dbdserver = {
enable = true;
};
services.mysql = {
enable = true;
package = pkgs.mysql;
ensureDatabases = [ "slurm_acct_db" ];
ensureUsers = [{
ensurePermissions = { "slurm_acct_db.*" = "ALL PRIVILEGES"; };
name = "slurm";
}];
};
};
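
Note that no database password appears anywhere in this setup: ensureUsers creates a passwordless MySQL account named slurm, which matches the slurm system user the module uses as SlurmUser by default, so slurmdbd can presumably authenticate over the local unix socket; a StoragePass entry in extraConfig would only be needed for a remote or password-protected database.
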
node1 = computeNode;
node2 = computeNode;
node3 = computeNode;
@@ -54,7 +78,7 @@ in {
startAll;
# Set up authentification across the cluster
foreach my $node (($submit,$control,$node1,$node2,$node3))
foreach my $node (($submit,$control,$dbd,$node1,$node2,$node3))
{
$node->waitForUnit("default.target");
@@ -63,10 +87,22 @@ in {
$node->succeed("chmod 0400 /etc/munge/munge.key");
$node->succeed("chown munge:munge /etc/munge/munge.key");
$node->succeed("systemctl restart munged");
}
$node->waitForUnit("munged");
};
# Restart the services since they have probably failed due to the munge init
# failure
subtest "can_start_slurmdbd", sub {
$dbd->succeed("systemctl restart slurmdbd");
$dbd->waitForUnit("slurmdbd.service");
};
# there needs to be an entry for the current
# cluster in the database before slurmctld is restarted
subtest "add_account", sub {
$control->succeed("sacctmgr -i add cluster default");
};
subtest "can_start_slurmctld", sub {
$control->succeed("systemctl restart slurmctld");
@@ -81,12 +117,17 @@ in {
}
};
# Test that the cluster work and can distribute jobs;
# Test that the cluster works and can distribute jobs;
subtest "run_distributed_command", sub {
# Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
# The output must contain the 3 different names
$submit->succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq");
};
subtest "check_slurm_dbd", sub {
# find the srun job from above in the database
$submit->succeed("sacct | grep hostname");
};
'';
})
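
As a usage note, this test should be runnable on its own with "nix-build nixos/tests/slurm.nix" from a nixpkgs checkout; it starts the control, submit, dbd and node1-3 VMs and runs the subtests above, including the new slurmdbd checks.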