From c9246af112470cd83369158d883fcbf6b52b42ee Mon Sep 17 00:00:00 2001 From: Brian Van Essen Date: Mon, 22 Feb 2021 14:35:58 -0800 Subject: [PATCH] Adding support for ROCm for the LBANN software stack. (#21716) * Also removed LBANN CUDA CMake flags that are set by the version of Hydrogen that LBANN is compiled against. * Updated recipes to use HWLOC 2.3 with ROCm to enable topology awareness. Co-authored-by: Harmen Stoppels --- .../builtin/packages/aluminum/package.py | 23 ++++++++++++-- .../builtin/packages/dihydrogen/package.py | 20 +++++++++++- .../builtin/packages/hydrogen/package.py | 26 ++++++++++++++-- .../repos/builtin/packages/lbann/package.py | 31 +++++++++++++++++-- 4 files changed, 92 insertions(+), 8 deletions(-) diff --git a/var/spack/repos/builtin/packages/aluminum/package.py b/var/spack/repos/builtin/packages/aluminum/package.py index 247ac56d2f..b835019f7a 100644 --- a/var/spack/repos/builtin/packages/aluminum/package.py +++ b/var/spack/repos/builtin/packages/aluminum/package.py @@ -7,7 +7,7 @@ from spack import * -class Aluminum(CMakePackage, CudaPackage): +class Aluminum(CMakePackage, CudaPackage, ROCmPackage): """Aluminum provides a generic interface to high-performance communication libraries, with a focus on allreduce algorithms. 
Blocking and non-blocking algorithms and GPU-aware @@ -38,13 +38,19 @@ class Aluminum(CMakePackage, CudaPackage): ' communication of accelerator data') variant('cuda_rma', default=False, description='Builds with support for CUDA intra-node ' ' Put/Get and IPC RMA functionality') + variant('rccl', default=False, description='Builds with support for NCCL communication lib') depends_on('cmake@3.17.0:', type='build') depends_on('mpi') depends_on('nccl', when='+nccl') depends_on('hwloc@1.11:') depends_on('hwloc +cuda +nvml', when='+cuda') + depends_on('hwloc@2.3.0:', when='+rocm') depends_on('cub', when='@:0.1,0.6.0: +cuda ^cuda@:10.99') + depends_on('hipcub', when='@:0.1,0.6.0: +rocm') + + conflicts('~cuda', when='+cuda_rma', msg='CUDA RMA support requires CUDA') + conflicts('+cuda', when='+rocm', msg='CUDA and ROCm support are mutually exclusive') generator = 'Ninja' depends_on('ninja', type='build') @@ -54,7 +60,8 @@ def cmake_args(self): args = [ '-DCMAKE_CXX_STANDARD=14', '-DALUMINUM_ENABLE_CUDA:BOOL=%s' % ('+cuda' in spec), - '-DALUMINUM_ENABLE_NCCL:BOOL=%s' % ('+nccl' in spec)] + '-DALUMINUM_ENABLE_NCCL:BOOL=%s' % ('+nccl' in spec or '+rccl' in spec), + '-DALUMINUM_ENABLE_ROCM:BOOL=%s' % ('+rocm' in spec)] if '+cuda' in spec: args.append('-DCMAKE_CUDA_STANDARD=14') @@ -82,4 +89,16 @@ def cmake_args(self): args.extend([ '-DOpenMP_DIR={0}'.format(clang_root)]) + if '+rocm' in spec: + args.extend([ + '-DHIP_ROOT_DIR={0}'.format(spec['hip'].prefix), + '-DHIP_CXX_COMPILER={0}'.format(self.spec['hip'].hipcc)]) + archs = self.spec.variants['amdgpu_target'].value + if archs != 'none': + arch_str = ",".join(archs) + args.append( + '-DHIP_HIPCC_FLAGS=--amdgpu-target={0}' + ' -g -fsized-deallocation -fPIC'.format(arch_str) + ) + return args diff --git a/var/spack/repos/builtin/packages/dihydrogen/package.py b/var/spack/repos/builtin/packages/dihydrogen/package.py index 0591eaaafb..9f4b74ec35 100644 --- a/var/spack/repos/builtin/packages/dihydrogen/package.py +++ 
b/var/spack/repos/builtin/packages/dihydrogen/package.py @@ -7,7 +7,7 @@ from spack import * -class Dihydrogen(CMakePackage, CudaPackage): +class Dihydrogen(CMakePackage, CudaPackage, ROCmPackage): """DiHydrogen is the second version of the Hydrogen fork of the well-known distributed linear algebra library, Elemental. DiHydrogen aims to be a basic distributed @@ -77,10 +77,16 @@ class Dihydrogen(CMakePackage, CudaPackage): # Add Aluminum variants depends_on('aluminum +cuda +nccl +ht +cuda_rma', when='+al +cuda') + depends_on('aluminum +rocm +rccl +ht +cuda_rma', when='+al +rocm') for arch in CudaPackage.cuda_arch_values: depends_on('aluminum cuda_arch=%s' % arch, when='+al +cuda cuda_arch=%s' % arch) + # variants +rocm and amdgpu_targets are not automatically passed to + # dependencies, so do it manually. + for val in ROCmPackage.amdgpu_targets: + depends_on('aluminum amdgpu_target=%s' % val, when='amdgpu_target=%s' % val) + depends_on('cuda', when=('+cuda' or '+legacy')) depends_on('cudnn', when=('+cuda' or '+legacy')) depends_on('cub', when='^cuda@:10.99') @@ -190,6 +196,18 @@ def cmake_args(self): '-DOpenMP_libomp_LIBRARY={0}/lib/libomp.dylib'.format( clang_root)]) + if '+rocm' in spec: + args.extend([ + '-DHIP_ROOT_DIR={0}'.format(spec['hip'].prefix), + '-DHIP_CXX_COMPILER={0}'.format(self.spec['hip'].hipcc)]) + archs = self.spec.variants['amdgpu_target'].value + if archs != 'none': + arch_str = ",".join(archs) + args.append( + '-DHIP_HIPCC_FLAGS=--amdgpu-target={0}' + ' -g -fsized-deallocation -fPIC'.format(arch_str) + ) + return args def setup_build_environment(self, env): diff --git a/var/spack/repos/builtin/packages/hydrogen/package.py b/var/spack/repos/builtin/packages/hydrogen/package.py index 72ce07f188..2998aa1342 100644 --- a/var/spack/repos/builtin/packages/hydrogen/package.py +++ b/var/spack/repos/builtin/packages/hydrogen/package.py @@ -7,7 +7,7 @@ from spack import * -class Hydrogen(CMakePackage, CudaPackage): +class Hydrogen(CMakePackage, 
CudaPackage, ROCmPackage): """Hydrogen: Distributed-memory dense and sparse-direct linear algebra and optimization library. Based on the Elemental library.""" @@ -64,11 +64,13 @@ class Hydrogen(CMakePackage, CudaPackage): description='Builds with support for FP16 precision data types') conflicts('~openmp', when='+omp_taskloops') + conflicts('+cuda', when='+rocm', msg='CUDA and ROCm support are mutually exclusive') depends_on('cmake@3.17.0:', type='build') depends_on('mpi') depends_on('hwloc@1.11:') depends_on('hwloc +cuda +nvml', when='+cuda') + depends_on('hwloc@2.3.0:', when='+rocm') # Note that #1712 forces us to enumerate the different blas variants depends_on('openblas', when='blas=openblas') @@ -96,10 +98,16 @@ class Hydrogen(CMakePackage, CudaPackage): # Add Aluminum variants depends_on('aluminum +cuda +nccl +ht +cuda_rma', when='+al +cuda') + depends_on('aluminum +rocm +rccl +ht', when='+al +rocm') for arch in CudaPackage.cuda_arch_values: depends_on('aluminum cuda_arch=%s' % arch, when='+al +cuda cuda_arch=%s' % arch) + # variants +rocm and amdgpu_targets are not automatically passed to + # dependencies, so do it manually. 
+ for val in ROCmPackage.amdgpu_targets: + depends_on('aluminum amdgpu_target=%s' % val, when='+al +rocm amdgpu_target=%s' % val) + # Note that this forces us to use OpenBLAS until #1712 is fixed depends_on('lapack', when='blas=openblas ~openmp_blas') @@ -110,6 +118,7 @@ class Hydrogen(CMakePackage, CudaPackage): depends_on('cuda', when='+cuda') depends_on('cub', when='^cuda@:10.99') + depends_on('hipcub', when='+rocm') depends_on('half', when='+half') depends_on('llvm-openmp', when='%apple-clang +openmp') @@ -143,8 +152,9 @@ def cmake_args(self): '-DHydrogen_ENABLE_MPC:BOOL=%s' % ('+mpfr' in spec), '-DHydrogen_GENERAL_LAPACK_FALLBACK=ON', '-DHydrogen_ENABLE_ALUMINUM=%s' % ('+al' in spec), - '-DHydrogen_ENABLE_CUB=%s' % ('+cuda' in spec), + '-DHydrogen_ENABLE_CUB=%s' % ('+cuda' in spec or '+rocm' in spec), '-DHydrogen_ENABLE_CUDA=%s' % ('+cuda' in spec), + '-DHydrogen_ENABLE_ROCM=%s' % ('+rocm' in spec), '-DHydrogen_ENABLE_TESTING=%s' % ('+test' in spec), '-DHydrogen_ENABLE_HALF=%s' % ('+half' in spec), '-DHydrogen_ENABLE_GPU_FP16=%s' % enable_gpu_fp16, @@ -153,6 +163,18 @@ def cmake_args(self): if '+cuda' in spec: args.append('-DCMAKE_CUDA_STANDARD=14') + if '+rocm' in spec: + args.extend([ + '-DHIP_ROOT_DIR={0}'.format(spec['hip'].prefix), + '-DHIP_CXX_COMPILER={0}'.format(self.spec['hip'].hipcc)]) + archs = self.spec.variants['amdgpu_target'].value + if archs != 'none': + arch_str = ",".join(archs) + args.append( + '-DHIP_HIPCC_FLAGS=--amdgpu-target={0}' + ' -g -fsized-deallocation -fPIC'.format(arch_str) + ) + # Add support for OS X to find OpenMP (LLVM installed via brew) if self.spec.satisfies('%clang +openmp platform=darwin'): clang = self.compiler.cc diff --git a/var/spack/repos/builtin/packages/lbann/package.py b/var/spack/repos/builtin/packages/lbann/package.py index fdc4b4a777..241fc433b6 100644 --- a/var/spack/repos/builtin/packages/lbann/package.py +++ b/var/spack/repos/builtin/packages/lbann/package.py @@ -7,7 +7,7 @@ from spack import * -class 
Lbann(CMakePackage, CudaPackage): +class Lbann(CMakePackage, CudaPackage, ROCmPackage): """LBANN: Livermore Big Artificial Neural Network Toolkit. A distributed memory, HPC-optimized, model and data parallel training toolkit for deep neural networks.""" @@ -73,6 +73,7 @@ class Lbann(CMakePackage, CudaPackage): conflicts('~cuda', when='+nvprof') conflicts('~hwloc', when='+al') conflicts('~cuda', when='+nvshmem') + conflicts('+cuda', when='+rocm', msg='CUDA and ROCm support are mutually exclusive') depends_on('cmake@3.17.0:', type='build') @@ -89,6 +90,8 @@ class Lbann(CMakePackage, CudaPackage): depends_on('hydrogen +cuda', when='+cuda') depends_on('hydrogen ~half', when='~half') depends_on('hydrogen +half', when='+half') + depends_on('hydrogen ~rocm', when='~rocm') + depends_on('hydrogen +rocm', when='+rocm') depends_on('hydrogen build_type=Debug', when='build_type=Debug') # Older versions depended on Elemental not Hydrogen @@ -103,6 +106,7 @@ class Lbann(CMakePackage, CudaPackage): # Add Aluminum variants depends_on('aluminum +cuda +nccl +ht +cuda_rma', when='+al +cuda') + depends_on('aluminum +rocm +rccl +ht', when='+al +rocm') depends_on('dihydrogen +openmp', when='+dihydrogen') depends_on('dihydrogen ~cuda', when='+dihydrogen ~cuda') @@ -114,6 +118,8 @@ class Lbann(CMakePackage, CudaPackage): depends_on('dihydrogen +half', when='+dihydrogen +half') depends_on('dihydrogen ~nvshmem', when='+dihydrogen ~nvshmem') depends_on('dihydrogen +nvshmem', when='+dihydrogen +nvshmem') + depends_on('dihydrogen ~rocm', when='+dihydrogen ~rocm') + depends_on('dihydrogen +rocm', when='+dihydrogen +rocm') depends_on('dihydrogen@0.1', when='@0.101:0.101.99 +dihydrogen') depends_on('dihydrogen@:0.0,0.2:', when='@:0.90,0.102: +dihydrogen') conflicts('~dihydrogen', when='+distconv') @@ -124,13 +130,22 @@ class Lbann(CMakePackage, CudaPackage): depends_on('dihydrogen cuda_arch=%s' % arch, when='+dihydrogen cuda_arch=%s' % arch) depends_on('nccl cuda_arch=%s' % arch, when='+cuda 
cuda_arch=%s' % arch) + # variants +rocm and amdgpu_targets are not automatically passed to + # dependencies, so do it manually. + for val in ROCmPackage.amdgpu_targets: + depends_on('hydrogen amdgpu_target=%s' % val, when='amdgpu_target=%s' % val) + depends_on('aluminum amdgpu_target=%s' % val, when='+al amdgpu_target=%s' % val) + depends_on('dihydrogen amdgpu_target=%s' % val, when='+dihydrogen amdgpu_target=%s' % val) + depends_on('cudnn', when='@0.90:0.100.99 +cuda') depends_on('cudnn@8.0.2:', when='@:0.90,0.101: +cuda') depends_on('cub', when='@0.94:0.98.2 +cuda ^cuda@:10.99') + depends_on('hipcub', when='+rocm') depends_on('mpi') depends_on('hwloc@1.11:', when='@:0.90,0.102: +hwloc') depends_on('hwloc@1.11:1.11.99', when='@0.95:0.101.99 +hwloc') depends_on('hwloc +cuda +nvml', when='+cuda') + depends_on('hwloc@2.3.0:', when='+rocm') depends_on('half', when='+half') @@ -236,8 +251,6 @@ def cmake_args(self): '-DLBANN_WITH_HWLOC=%s' % ('+hwloc' in spec), '-DLBANN_WITH_ALUMINUM:BOOL=%s' % ('+al' in spec), '-DLBANN_WITH_CONDUIT:BOOL=%s' % ('+conduit' in spec), - '-DLBANN_WITH_CUDA:BOOL=%s' % ('+cuda' in spec), - '-DLBANN_WITH_CUDNN:BOOL=%s' % ('+cuda' in spec), '-DLBANN_WITH_NVSHMEM:BOOL=%s' % ('+nvshmem' in spec), '-DLBANN_WITH_FFT:BOOL=%s' % ('+fft' in spec), '-DLBANN_WITH_ONEDNN:BOOL=%s' % ('+onednn' in spec), @@ -322,6 +335,18 @@ def cmake_args(self): args.append( '-DLBANN_WITH_DISTCONV:BOOL=%s' % ('+distconv' in spec)) + if '+rocm' in spec: + args.extend([ + '-DHIP_ROOT_DIR={0}'.format(spec['hip'].prefix), + '-DHIP_CXX_COMPILER={0}'.format(self.spec['hip'].hipcc)]) + archs = self.spec.variants['amdgpu_target'].value + if archs != 'none': + arch_str = ",".join(archs) + args.append( + '-DHIP_HIPCC_FLAGS=--amdgpu-target={0}' + ' -g -fsized-deallocation -fPIC -std=c++17'.format(arch_str) + ) + return args @when('@0.91:0.93')