rocfft: split kernel compilation into separate derivations

To avoid "output limit exceeded" errors in Hydra, we build the kernel
device libs and the kernel RTC cache database in separate derivations.
This commit is contained in:
Kira Bruneau 2023-05-17 10:16:42 -04:00
parent e88bc03e4b
commit c4a937a9f6
3 changed files with 235 additions and 16 deletions

View file

@ -1,22 +1,93 @@
{ lib
{ rocfft
, lib
, stdenv
, fetchFromGitHub
, rocmUpdateScript
, cmake
, rocm-cmake
, rocrand
, hip
, openmp
, sqlite
, python3
, gtest
, rocm-cmake
, sqlite
, boost
, fftw
, fftwFloat
, gtest
, openmp
, rocrand
, buildTests ? false
, buildBenchmarks ? false
# NOTE: Update the default GPU targets on every update
, gpuTargets ? [
"gfx803"
"gfx900"
"gfx906"
"gfx908"
"gfx90a"
"gfx1030"
"gfx1100"
"gfx1102"
]
}:
let
# The kernel device libs and the kernel RTC cache database are built in
# their own derivations (rather than inside the main rocfft build) to
# avoid "output limit exceeded" errors in hydra.
#
# mkKernelDeviceLib takes one GPU target name and returns a variant of
# rocfft that builds and installs only that target's device library.
mkKernelDeviceLib = target:
  (rocfft.overrideAttrs (prevAttrs: {
    pname = "rocfft-device-${target}";

    # rocm_install_targets doesn't support an EXCLUDE_FROM_ALL option, so
    # as a workaround this extra patch adds the install rule for the
    # device library back.
    patches = prevAttrs.patches ++ [ ./device-install.patch ];

    # Only build the device library for this GPU target
    buildFlags = [ "rocfft-device-${target}" ];

    # Only install the "device" component
    installPhase = ''
      runHook preInstall
      cmake --install . --component device
      runHook postInstall
    '';

    requiredSystemFeatures = [ "big-parallel" ];
  })).override {
    gpuTargets = [ target ];
    buildBenchmarks = false;
    buildTests = false;
  };

# One device library derivation for each entry of gpuTargets
kernelDeviceLibs = map mkKernelDeviceLib gpuTargets;
# TODO: Figure out how to split the kernel cache by GPU target as well
#
# This will be a bit more complicated than it is for the kernel device
# libs: the device libs can be linked into rocfft as separate libraries
# for each GPU target, whereas the kernel cache needs to be compiled
# into one sqlite database.
#
# It's not clear why this even needs to be a db in the first place.
# Storing the pre-compiled kernels as plain files would simplify things
# A LOT (but that'd need a lot of patching).
kernelRtcCache = (rocfft.overrideAttrs (_: {
  pname = "rocfft-kernel-cache";

  requiredSystemFeatures = [ "big-parallel" ];

  # Only build the kernel cache target
  buildFlags = [ "rocfft_kernel_cache_target" ];

  # Only install the "kernel_cache" component
  installPhase = ''
    runHook preInstall
    cmake --install . --component kernel_cache
    runHook postInstall
  '';
})).override {
  buildBenchmarks = false;
  buildTests = false;
};
in
stdenv.mkDerivation (finalAttrs: {
pname = "rocfft";
version = "5.4.3";
@ -36,23 +107,29 @@ stdenv.mkDerivation (finalAttrs: {
hash = "sha256-FsefE0B2hF5ZcHDB6TscwFeZ1NKFkWX7VDpEvvbDbOk=";
};
nativeBuildInputs = [
cmake
rocm-cmake
hip
patches = [
# Exclude kernel compilation & installation from "all" target,
# and split device libraries by GPU target
./split-kernel-compilation.patch
];
buildInputs = [
sqlite
nativeBuildInputs = [
cmake
hip
python3
rocm-cmake
];
buildInputs = (lib.optionals (finalAttrs.pname == "rocfft") kernelDeviceLibs) ++ [
sqlite
] ++ lib.optionals buildTests [
gtest
] ++ lib.optionals (buildTests || buildBenchmarks) [
rocrand
boost
fftw
fftwFloat
openmp
rocrand
];
propagatedBuildInputs = lib.optionals buildTests [
@ -70,6 +147,7 @@ stdenv.mkDerivation (finalAttrs: {
"-DCMAKE_INSTALL_BINDIR=bin"
"-DCMAKE_INSTALL_LIBDIR=lib"
"-DCMAKE_INSTALL_INCLUDEDIR=include"
"-DAMDGPU_TARGETS=${lib.concatStringsSep ";" gpuTargets}"
] ++ lib.optionals buildTests [
"-DBUILD_CLIENTS_TESTS=ON"
] ++ lib.optionals buildBenchmarks [
@ -77,7 +155,9 @@ stdenv.mkDerivation (finalAttrs: {
"-DBUILD_CLIENTS_SAMPLES=ON"
];
postInstall = lib.optionalString buildTests ''
postInstall = lib.optionalString (finalAttrs.pname == "rocfft") ''
ln -s ${kernelRtcCache}/lib/rocfft_kernel_cache.db "$out/lib"
'' + lib.optionalString buildTests ''
mkdir -p $test/{bin,lib/fftw}
cp -a $out/bin/* $test/bin
ln -s ${fftw}/lib/libfftw*.so $test/lib/fftw
@ -101,10 +181,10 @@ stdenv.mkDerivation (finalAttrs: {
};
meta = with lib; {
description = "FFT implementation for ROCm ";
description = "FFT implementation for ROCm";
homepage = "https://github.com/ROCmSoftwarePlatform/rocFFT";
license = with licenses; [ mit ];
maintainers = teams.rocm.members;
maintainers = with maintainers; [ kira-bruneau ] ++ teams.rocm.members;
platforms = platforms.linux;
broken = versions.minor finalAttrs.version != versions.minor hip.version;
};

View file

@ -0,0 +1,15 @@
diff --git a/library/src/device/CMakeLists.txt b/library/src/device/CMakeLists.txt
index 73a8ec9..9bfd4b8 100644
--- a/library/src/device/CMakeLists.txt
+++ b/library/src/device/CMakeLists.txt
@@ -255,4 +255,10 @@ foreach( sub ${AMDGPU_TARGETS} )
if( NOT BUILD_SHARED_LIBS )
set_target_properties( rocfft-device-${sub} PROPERTIES PREFIX "lib" )
endif( )
+
+ rocm_install_targets(
+ TARGETS
+ rocfft-device-${sub}
+ COMPONENT device
+ )
endforeach()

View file

@ -0,0 +1,124 @@
diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt
index 3a16304..606b711 100644
--- a/library/src/CMakeLists.txt
+++ b/library/src/CMakeLists.txt
@@ -250,12 +250,12 @@ foreach( target
endforeach()
-add_executable( rocfft_aot_helper
+add_executable( rocfft_aot_helper EXCLUDE_FROM_ALL
rocfft_aot_helper.cpp
rocfft_stub.cpp
)
-add_executable( rocfft_config_search
+add_executable( rocfft_config_search EXCLUDE_FROM_ALL
rocfft_config_search.cpp
rocfft_stub.cpp
)
@@ -279,10 +279,10 @@ endif()
target_link_libraries( rocfft PRIVATE ${ROCFFT_DEVICE_LINK_LIBS} )
-target_link_libraries( rocfft PRIVATE rocfft-device-0 )
-target_link_libraries( rocfft PRIVATE rocfft-device-1 )
-target_link_libraries( rocfft PRIVATE rocfft-device-2 )
-target_link_libraries( rocfft PRIVATE rocfft-device-3 )
+foreach( sub ${AMDGPU_TARGETS} )
+ target_link_libraries( rocfft PRIVATE -lrocfft-device-${sub} )
+endforeach()
+
foreach( target rocfft rocfft_aot_helper rocfft_config_search )
# RTC uses dladdr to find the RTC helper program
if( NOT WIN32 )
@@ -347,7 +347,7 @@ add_custom_command(
DEPENDS rocfft_aot_helper rocfft_rtc_helper
COMMENT "Compile kernels into shipped cache file"
)
-add_custom_target( rocfft_kernel_cache_target ALL
+add_custom_target( rocfft_kernel_cache_target
DEPENDS rocfft_kernel_cache.db
VERBATIM
)
@@ -392,7 +392,8 @@ else()
endif()
rocm_install(FILES ${ROCFFT_KERNEL_CACHE_PATH}
DESTINATION "${ROCFFT_KERNEL_CACHE_INSTALL_DIR}"
- COMPONENT runtime
+ COMPONENT kernel_cache
+ EXCLUDE_FROM_ALL
)
# PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ
diff --git a/library/src/device/CMakeLists.txt b/library/src/device/CMakeLists.txt
index 9f7b85f..73a8ec9 100644
--- a/library/src/device/CMakeLists.txt
+++ b/library/src/device/CMakeLists.txt
@@ -170,11 +170,11 @@ list( SORT rocfft_device_source )
# functions callable by rocFFT and depends on amdhip64, and another
# one usable by AOT RTC that contains no device code
list( FILTER rocfft_device_source EXCLUDE REGEX function_pool.cpp )
-add_library( rocfft-function-pool OBJECT
+add_library( rocfft-function-pool OBJECT EXCLUDE_FROM_ALL
function_pool.cpp
)
target_compile_definitions( rocfft-function-pool PRIVATE FUNCTION_POOL_STANDALONE_BODY= )
-add_library( rocfft-function-pool-standalone OBJECT
+add_library( rocfft-function-pool-standalone OBJECT EXCLUDE_FROM_ALL
function_pool.cpp
)
target_compile_definitions( rocfft-function-pool-standalone PRIVATE FUNCTION_POOL_STANDALONE_BODY={} )
@@ -193,26 +193,15 @@ foreach( pool rocfft-function-pool rocfft-function-pool-standalone )
add_dependencies(${pool} gen_headers_target)
endforeach()
-list( LENGTH rocfft_device_source rocfft_device_source_len )
-math(EXPR split_len "${rocfft_device_source_len} / 4")
-math(EXPR split_idx_2 "${rocfft_device_source_len} / 4 * 2")
-math(EXPR split_idx_3 "${rocfft_device_source_len} / 4 * 3")
-
-list( SUBLIST rocfft_device_source 0 ${split_len} rocfft_device_source_0 )
-list( SUBLIST rocfft_device_source ${split_len} ${split_len} rocfft_device_source_1 )
-list( SUBLIST rocfft_device_source ${split_idx_2} ${split_len} rocfft_device_source_2 )
-list( SUBLIST rocfft_device_source ${split_idx_3} -1 rocfft_device_source_3 )
-
-foreach( sub RANGE 3 )
- set( rocfft_device_source_var rocfft_device_source_${sub} )
+foreach( sub ${AMDGPU_TARGETS} )
if(NOT SINGLELIB)
- add_library( rocfft-device-${sub}
- ${${rocfft_device_source_var}} )
+ add_library( rocfft-device-${sub} EXCLUDE_FROM_ALL
+ ${rocfft_device_source} )
else()
# Compile the device lib as a static library, which is then linked
# into librocfft.so Useful for testing purposes.
- add_library( rocfft-device-${sub} STATIC
- ${${rocfft_device_source_var}} )
+ add_library( rocfft-device-${sub} STATIC EXCLUDE_FROM_ALL
+ ${rocfft_device_source} )
# if we're building singlelib, we don't want to export any of the
# device library symbols to the main library
@@ -241,9 +230,7 @@ foreach( sub RANGE 3 )
# Set AMD GPU architecture options
# Enable compilation of desired architectures
- foreach( target ${AMDGPU_TARGETS} )
- target_compile_options( rocfft-device-${sub} PRIVATE --offload-arch=${target} )
- endforeach( )
+ target_compile_options( rocfft-device-${sub} PRIVATE --offload-arch=${sub} )
target_include_directories( rocfft-device-${sub}
PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
@@ -268,9 +255,4 @@ foreach( sub RANGE 3 )
if( NOT BUILD_SHARED_LIBS )
set_target_properties( rocfft-device-${sub} PROPERTIES PREFIX "lib" )
endif( )
-
- rocm_install_targets(
- TARGETS
- rocfft-device-${sub}
- )
endforeach()