python3Packages.tensorflow: fix `GLIBCXX_3.4.30' not found

Make tensorflow (and a bunch of other things) use a CUDA-compatible
toolchain. Introduces cudaPackages.backendStdenv.
Someone Serge 2023-02-27 16:28:07 +02:00
parent d378cc6fb2
commit 5f4bdbe6c3
8 changed files with 88 additions and 69 deletions
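
For context, backendStdenv is consumed like any other stdenv. A minimal
sketch of downstream usage (hypothetical package; assumes a nixpkgs
checkout containing this commit):

    { cudaPackages, ... }:

    # backendStdenv is a plain gccNNStdenv chosen to match the CUDA
    # release; it does not itself provide nvcc.
    cudaPackages.backendStdenv.mkDerivation {
      pname = "hello-cuda";
      version = "0.1";
      # ... src and CUDA inputs omitted ...
    }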

View file

@@ -11,7 +11,7 @@ args@
, fetchurl
, fontconfig
, freetype
, gcc
, gcc # :: String
, gdk-pixbuf
, glib
, glibc
@@ -22,13 +22,13 @@ args@
, perl
, python3
, requireFile
, stdenv
, backendStdenv # E.g. gcc11Stdenv, set in extension.nix
, unixODBC
, xorg
, zlib
}:
stdenv.mkDerivation rec {
backendStdenv.mkDerivation rec {
pname = "cudatoolkit";
inherit version runPatches;
@@ -146,37 +146,24 @@ stdenv.mkDerivation rec {
# Fix builds with newer glibc version
sed -i "1 i#define _BITS_FLOATN_H" "$out/include/host_defines.h"
# Ensure that cmake can find CUDA.
'' +
# Point NVCC at a compatible compiler
# FIXME: redist cuda_nvcc copy-pastes this code
# Refer to comments in the overrides for cuda_nvcc for explanation
# CUDA_TOOLKIT_ROOT_DIR is legacy,
# Cf. https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables
''
mkdir -p $out/nix-support
echo "cmakeFlags+=' -DCUDA_TOOLKIT_ROOT_DIR=$out'" >> $out/nix-support/setup-hook
# Set the host compiler to be used by nvcc.
# FIXME: redist cuda_nvcc copy-pastes this code
# For CMake-based projects:
# https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables
# https://cmake.org/cmake/help/latest/envvar/CUDAHOSTCXX.html
# https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_HOST_COMPILER.html
# For non-CMake projects:
# FIXME: results in "incompatible redefinition" warnings ...but we keep
# both this and cmake variables until we come up with a more general
# solution
# https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#compiler-bindir-directory-ccbin
cat <<EOF >> $out/nix-support/setup-hook
cmakeFlags+=' -DCUDA_HOST_COMPILER=${gcc}/bin'
cmakeFlags+=' -DCMAKE_CUDA_HOST_COMPILER=${gcc}/bin'
cmakeFlags+=' -DCUDA_TOOLKIT_ROOT_DIR=$out'
cmakeFlags+=' -DCUDA_HOST_COMPILER=${backendStdenv.cc}/bin'
cmakeFlags+=' -DCMAKE_CUDA_HOST_COMPILER=${backendStdenv.cc}/bin'
if [ -z "\''${CUDAHOSTCXX-}" ]; then
export CUDAHOSTCXX=${gcc}/bin;
export CUDAHOSTCXX=${backendStdenv.cc}/bin;
fi
export NVCC_PREPEND_FLAGS+=' --compiler-bindir=${gcc}/bin'
export NVCC_PREPEND_FLAGS+=' --compiler-bindir=${backendStdenv.cc}/bin'
EOF
# Move some libraries to the lib output so that programs that
# depend on them don't pull in this entire monstrosity.
mkdir -p $lib/lib
@@ -212,11 +199,10 @@ stdenv.mkDerivation rec {
# The path to libstdc++ and such
#
# NB:
# 1. "gcc" (gcc-wrapper) here is what's exposed as cudaPackages.cudatoolkit.cc
# 2. "gcc.cc" is the unwrapped gcc
# 3. "gcc.cc.lib" is one of its outputs
"${gcc.cc.lib}/lib64"
# `backendStdenv` is the cuda-compatible toolchain that we pick in
# extension.nix; we hand it to NVCC to use as a back-end, and we link
# cudatoolkit's binaries against its libstdc++
"${backendStdenv.cc.cc.lib}/lib64"
"$out/jre/lib/amd64/jli"
"$out/lib64"
@@ -286,7 +272,7 @@ stdenv.mkDerivation rec {
popd
'';
passthru = {
cc = gcc;
cc = backendStdenv.cc;
majorMinorVersion = lib.versions.majorMinor version;
majorVersion = lib.versions.majorMinor version;
};
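
To make the hook's effect concrete, here is a hedged sketch of a
downstream derivation (hypothetical names; nothing is set manually,
the setup hook written above does the work):

    { cudaPackages, cmake }:

    cudaPackages.backendStdenv.mkDerivation {
      pname = "cuda-consumer";
      version = "0.1";
      nativeBuildInputs = [ cmake ];
      # Pulling cudatoolkit in as a dependency sources its setup hook, which
      # appends -DCUDA_HOST_COMPILER/-DCMAKE_CUDA_HOST_COMPILER to cmakeFlags,
      # exports CUDAHOSTCXX when unset, and prepends --compiler-bindir to
      # NVCC_PREPEND_FLAGS, so nvcc uses backendStdenv's gcc as its back-end.
      buildInputs = [ cudaPackages.cudatoolkit ];
      # ... src omitted ...
    }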

View file

@@ -7,11 +7,24 @@ final: prev: let
# Version info for the classic cudatoolkit packages that contain everything that is in redist.
cudatoolkitVersions = final.lib.importTOML ./versions.toml;
finalVersion = cudatoolkitVersions.${final.cudaVersion};
# Exposed as cudaPackages.backendStdenv.
# We don't call it just "stdenv" to avoid confusion: e.g. this toolchain doesn't contain nvcc.
# Instead, it's the back-end toolchain for nvcc to use.
# We also use this to link a compatible libstdc++ (backendStdenv.cc.cc.lib)
# Cf. https://github.com/NixOS/nixpkgs/pull/218265 for context
backendStdenv = prev.pkgs."${finalVersion.gcc}Stdenv";
### Add classic cudatoolkit package
cudatoolkit = buildCudaToolkitPackage ((attrs: attrs // { gcc = prev.pkgs.${attrs.gcc}; }) cudatoolkitVersions.${final.cudaVersion});
cudatoolkit = buildCudaToolkitPackage (finalVersion // { inherit backendStdenv; });
cudaFlags = final.callPackage ./flags.nix {};
in {
inherit cudatoolkit cudaFlags;
in
{
inherit
backendStdenv
cudatoolkit
cudaFlags;
}
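
To illustrate the lookup above, a sketch with a hypothetical stand-in for
one versions.toml entry (the real data lives in ./versions.toml; `pkgs`
stands for the overlay's `prev.pkgs`):

    let
      # Hypothetical stand-in for cudatoolkitVersions.${final.cudaVersion}:
      finalVersion = { gcc = "gcc11"; };
    in
      # The same interpolated lookup as above: selects pkgs.gcc11Stdenv,
      # a stdenv whose gcc this CUDA release supports as an nvcc host compiler.
      pkgs."${finalVersion.gcc}Stdenv"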

View file

@@ -1,5 +1,5 @@
{ lib
, stdenv
, backendStdenv
, fetchurl
, autoPatchelfHook
, autoAddOpenGLRunpathHook
@@ -11,7 +11,7 @@ attrs:
let
arch = "linux-x86_64";
in
stdenv.mkDerivation {
backendStdenv.mkDerivation {
inherit pname;
inherit (attrs) version;
@@ -33,11 +33,8 @@ stdenv.mkDerivation {
# autoPatchelfHook will search for a libstdc++ and we're giving it a
# "compatible" libstdc++ from the same toolchain that NVCC uses.
#
# E.g. it might happen that stdenv=gcc12Stdenv, but we build against cuda11
# that only "supports" gcc11. Linking against gcc12's libraries we might
# sometimes encounter dynamic linkage errors at runtime
# NB: We don't actually know if this is the right thing to do
cudatoolkit.cc.cc.lib
backendStdenv.cc.cc.lib
];
dontBuild = true;
@@ -51,7 +48,7 @@ stdenv.mkDerivation {
runHook postInstall
'';
passthru.stdenv = stdenv;
passthru.stdenv = backendStdenv;
meta = {
description = attrs.name;
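
The libstdc++ pinning described in those comments, reduced to a
standalone sketch (package name hypothetical; the attribute path
backendStdenv.cc.cc.lib is the one this commit introduces):

    { backendStdenv, autoPatchelfHook }:

    backendStdenv.mkDerivation {
      pname = "some-redist-lib";
      version = "0.0";
      nativeBuildInputs = [ autoPatchelfHook ];
      buildInputs = [
        # autoPatchelfHook resolves libstdc++ against the same toolchain
        # that nvcc uses as a back-end, avoiding `GLIBCXX_3.4.30' not
        # found style mismatches at runtime.
        backendStdenv.cc.cc.lib
      ];
      # ... src and installPhase omitted ...
    }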

View file

@@ -24,7 +24,7 @@ in
cuda_nvcc = prev.cuda_nvcc.overrideAttrs (oldAttrs:
let
inherit (prev.cudatoolkit) cc;
inherit (prev.backendStdenv) cc;
in
{
# Point NVCC at a compatible compiler
@@ -44,7 +44,6 @@ in
postInstall = (oldAttrs.postInstall or "") + ''
mkdir -p $out/nix-support
cat <<EOF >> $out/nix-support/setup-hook
cmakeFlags+=' -DCUDA_TOOLKIT_ROOT_DIR=$out'
cmakeFlags+=' -DCUDA_HOST_COMPILER=${cc}/bin'
cmakeFlags+=' -DCMAKE_CUDA_HOST_COMPILER=${cc}/bin'
if [ -z "\''${CUDAHOSTCXX-}" ]; then
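
For reference, the wrapper layering these overrides rely on, as a
sketch (it mirrors the NB list in common.nix above; binding names are
illustrative):

    { backendStdenv }:
    {
      # The wrapped compiler driver; what --compiler-bindir points into:
      ccWrapper = backendStdenv.cc;
      # The unwrapped gcc underneath the wrapper:
      gccUnwrapped = backendStdenv.cc.cc;
      # The output that ships libstdc++.so, linked against throughout:
      libstdcxx = backendStdenv.cc.cc.lib;
    }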

View file

@@ -1,11 +1,11 @@
{
stdenv,
backendStdenv,
lib,
zlib,
useCudatoolkitRunfile ? false,
cudaVersion,
cudaMajorVersion,
cudatoolkit, # if cuda>=11: only used for .cc
cudatoolkit, # For cuda < 11
libcublas ? null, # cuda <11 doesn't ship redist packages
autoPatchelfHook,
autoAddOpenGLRunpathHook,
@@ -26,7 +26,7 @@
maxCudaVersion,
}:
assert useCudatoolkitRunfile || (libcublas != null); let
inherit (cudatoolkit) cc;
inherit (backendStdenv) cc;
inherit (lib) lists strings trivial versions;
# majorMinorPatch :: String -> String
@@ -46,7 +46,7 @@ assert useCudatoolkitRunfile || (libcublas != null); let
then cudatoolkit
else libcublas;
in
stdenv.mkDerivation {
backendStdenv.mkDerivation {
pname = "cudatoolkit-${cudaMajorVersion}-cudnn";
version = versionTriple;

View file

@@ -1,5 +1,5 @@
{ lib
, stdenv
, backendStdenv
, requireFile
, autoPatchelfHook
, autoAddOpenGLRunpathHook
@@ -18,7 +18,7 @@
assert lib.assertMsg (lib.strings.versionAtLeast cudnn.version fileVersionCudnn)
"This version of TensorRT requires at least cuDNN ${fileVersionCudnn} (current version is ${cudnn.version})";
stdenv.mkDerivation rec {
backendStdenv.mkDerivation rec {
pname = "cudatoolkit-${cudatoolkit.majorVersion}-tensorrt";
version = fullVersion;
src = requireFile rec {
@@ -45,7 +45,7 @@ stdenv.mkDerivation rec {
# Used by autoPatchelfHook
buildInputs = [
cudatoolkit.cc.cc.lib # libstdc++
backendStdenv.cc.cc.lib # libstdc++
cudatoolkit
cudnn
];
@@ -74,6 +74,8 @@ stdenv.mkDerivation rec {
"$out/lib/libnvinfer_builder_resource.so.${mostOfVersion}"
'';
passthru.stdenv = backendStdenv;
meta = with lib; {
# Check that the cudatoolkit version satisfies our min/max constraints (both
# inclusive). We mark the package as broken if it fails to satisfy the
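
A sketch of the check that comment describes (assumed shape, using
nixpkgs' stock version helpers; minCudaVersion is assumed to mirror the
maxCudaVersion argument above, and the real expression may differ):

    # Both bounds inclusive:
    #   minCudaVersion <= cudatoolkit.version <= maxCudaVersion
    meta.broken =
      !(lib.versionAtLeast cudatoolkit.version minCudaVersion
        && lib.versionAtLeast maxCudaVersion cudatoolkit.version);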

View file

@@ -32,6 +32,26 @@
}:
let
originalStdenv = stdenv;
in
let
# Tensorflow looks at many toolchain-related variables which may diverge.
#
# Toolchain for cuda-enabled builds.
# We want to achieve two things:
# 1. NVCC should use a compatible back-end (e.g. gcc11 for cuda11)
# 2. Normal C++ files should be compiled with the same toolchain,
# to avoid potential weird dynamic linkage errors at runtime.
# This may not be necessary though
#
# Toolchain for Darwin:
# clang 7 fails to emit a symbol for
# __ZN4llvm11SmallPtrSetIPKNS_10AllocaInstELj8EED1Ev in any of the
# translation units, so the build fails at link time
stdenv =
if cudaSupport then cudaPackages.backendStdenv
else if originalStdenv.isDarwin then llvmPackages_11.stdenv
else originalStdenv;
inherit (cudaPackages) cudatoolkit cudnn nccl;
in
@@ -44,6 +64,7 @@ assert ! (stdenv.isDarwin && cudaSupport);
let
withTensorboard = (pythonOlder "3.6") || tensorboardSupport;
# FIXME: migrate to redist cudaPackages
cudatoolkit_joined = symlinkJoin {
name = "${cudatoolkit.name}-merged";
paths = [
@@ -56,10 +77,13 @@ let
];
};
# Tensorflow expects bintools at hard-coded paths, e.g. /usr/bin/ar
# The only way to overcome that is to set GCC_HOST_COMPILER_PREFIX,
# but that path must contain cc as well, so we merge them
cudatoolkit_cc_joined = symlinkJoin {
name = "${cudatoolkit.cc.name}-merged";
name = "${stdenv.cc.name}-merged";
paths = [
cudatoolkit.cc
stdenv.cc
binutils.bintools # for ar, dwp, nm, objcopy, objdump, strip
];
};
@@ -175,12 +199,7 @@ let
'';
}) else _bazel-build;
_bazel-build = (buildBazelPackage.override (lib.optionalAttrs stdenv.isDarwin {
# clang 7 fails to emit a symbol for
# __ZN4llvm11SmallPtrSetIPKNS_10AllocaInstELj8EED1Ev in any of the
# translation units, so the build fails at link time
stdenv = llvmPackages_11.stdenv;
})) {
_bazel-build = buildBazelPackage.override { inherit stdenv; } {
name = "${pname}-${version}";
bazel = bazel_5;
@@ -211,12 +230,13 @@ let
flatbuffers-core
giflib
grpc
icu
# Necessary to fix the "`GLIBCXX_3.4.30' not found" error
(icu.override { inherit stdenv; })
jsoncpp
libjpeg_turbo
libpng
lmdb-core
pybind11
(pybind11.overridePythonAttrs (_: { inherit stdenv; }))
snappy
sqlite
] ++ lib.optionals cudaSupport [
@@ -301,10 +321,12 @@ let
TF_NEED_CUDA = tfFeature cudaSupport;
TF_CUDA_PATHS = lib.optionalString cudaSupport "${cudatoolkit_joined},${cudnn},${nccl}";
GCC_HOST_COMPILER_PREFIX = lib.optionalString cudaSupport "${cudatoolkit_cc_joined}/bin";
GCC_HOST_COMPILER_PATH = lib.optionalString cudaSupport "${cudatoolkit_cc_joined}/bin/gcc";
TF_CUDA_COMPUTE_CAPABILITIES = lib.concatStringsSep "," cudaCapabilities;
# Needed even when we override stdenv: e.g. for ar
GCC_HOST_COMPILER_PREFIX = lib.optionalString cudaSupport "${cudatoolkit_cc_joined}/bin";
GCC_HOST_COMPILER_PATH = lib.optionalString cudaSupport "${cudatoolkit_cc_joined}/bin/cc";
postPatch = ''
# bazel 3.3 should work just as well as bazel 3.1
rm -f .bazelversion
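
The merged-prefix trick described above, in isolation (a sketch; the
binding name is illustrative):

    # Bazel expects ar, nm, objcopy, etc. to live next to the compiler, so
    # the wrapped cc and bintools are fused into one bin/ directory:
    cc_joined = symlinkJoin {
      name = "cc-and-bintools-merged";
      paths = [ stdenv.cc binutils.bintools ];
    };
    # Handed to TensorFlow's configure via:
    #   GCC_HOST_COMPILER_PREFIX = "${cc_joined}/bin";
    #   GCC_HOST_COMPILER_PATH   = "${cc_joined}/bin/cc";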

View file

@@ -1,4 +1,4 @@
{ lib, stdenv, fetchFromGitHub
{ lib, backendStdenv, fetchFromGitHub
, cmake, addOpenGLRunpath
, cudatoolkit
, cutensor
@@ -35,13 +35,13 @@ let
in
{
cublas = stdenv.mkDerivation (commonAttrs // {
cublas = backendStdenv.mkDerivation (commonAttrs // {
pname = "cuda-library-samples-cublas";
src = "${src}/cuBLASLt";
});
cusolver = stdenv.mkDerivation (commonAttrs // {
cusolver = backendStdenv.mkDerivation (commonAttrs // {
pname = "cuda-library-samples-cusolver";
src = "${src}/cuSOLVER";
@@ -49,7 +49,7 @@ in
sourceRoot = "cuSOLVER/gesv";
});
cutensor = stdenv.mkDerivation (commonAttrs // {
cutensor = backendStdenv.mkDerivation (commonAttrs // {
pname = "cuda-library-samples-cutensor";
src = "${src}/cuTENSOR";