From 8ca7c7700895032d7ec9fe728146e3ac0dbd0a64 Mon Sep 17 00:00:00 2001 From: Brian Van Essen Date: Mon, 7 Aug 2017 11:41:13 -0700 Subject: [PATCH] Elemental cublas (#4889) * Added a package for the MDAnalysis toolkit. * Added a patch that allows Elemental to use cuBLAS internally. * Added support for LBANN to use the new cuBLAS extension in Elemental. * Added a proper variant for when LBANN does not want to use cuBLAS in elemental. * Added a package for the cnpy project and used it in the lbann package. * Removed unnecessary comment lines. * Removed blank lines * Removed debug variant * Add support for libjpeg-turbo * Added additional variants for OpenCV features. Fixed bug when linking in TIFF support, where libtiff used the regular JPEG library and OpenCV used libjpeg-turbo. Now libtiff can use libjpeg-turbo. * Removed the variant for getting Elemental to use the cublas variant. Updated the requirements for OpenCV to add new options. * Fixed a flake8 error in OpenCV and added a path to find cnpy in lbann. * Fixed line too long flake8 error. * Added a flag to specify the datatype size in lbann and fixed a flake8 error. * Added a debug build variant using the new build_type * Fixed flake8 * Fixed how the debug build is pushed to Elemental * Fixed a bug in the Elemental package where the blas search flags were being overridden by the blas link flags. Changed how the sequential initialization variant is implemented in LBANN. * Added support via a variant to explicitly use mkl or openblas. This helps work around variant forwarding problems. * Updated package files to address pull request comments. 
--- .../repos/builtin/packages/cnpy/package.py | 34 + .../packages/elemental/elemental_cublas.patch | 668 ++++++++++++++++++ .../builtin/packages/elemental/package.py | 20 +- .../repos/builtin/packages/lbann/package.py | 26 +- .../repos/builtin/packages/libtiff/package.py | 5 +- .../repos/builtin/packages/opencv/package.py | 93 ++- 6 files changed, 804 insertions(+), 42 deletions(-) create mode 100644 var/spack/repos/builtin/packages/cnpy/package.py create mode 100644 var/spack/repos/builtin/packages/elemental/elemental_cublas.patch diff --git a/var/spack/repos/builtin/packages/cnpy/package.py b/var/spack/repos/builtin/packages/cnpy/package.py new file mode 100644 index 0000000000..b62df10c2e --- /dev/null +++ b/var/spack/repos/builtin/packages/cnpy/package.py @@ -0,0 +1,34 @@ +############################################################################## +# Copyright (c) 2013-2016, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory. +# +# This file is part of Spack. +# Created by Todd Gamblin, tgamblin@llnl.gov, All rights reserved. +# LLNL-CODE-647188 +# +# For details, see https://github.com/llnl/spack +# Please also see the NOTICE and LICENSE files for our notice and the LGPL. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License (as +# published by the Free Software Foundation) version 2.1, February 1999. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms and +# conditions of the GNU Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +############################################################################## +from spack import * + + +class Cnpy(CMakePackage): + """cnpy: library to read/write .npy and .npz files in C/C++.""" + + homepage = "https://github.com/rogersce/cnpy" + url = "https://github.com/rogersce/cnpy" + + version('master', git='https://github.com/rogersce/cnpy.git', branch="master") diff --git a/var/spack/repos/builtin/packages/elemental/elemental_cublas.patch b/var/spack/repos/builtin/packages/elemental/elemental_cublas.patch new file mode 100644 index 0000000000..9cf9b6e6b5 --- /dev/null +++ b/var/spack/repos/builtin/packages/elemental/elemental_cublas.patch @@ -0,0 +1,668 @@ +diff -Naur a/include/El/blas_like/level3.hpp b/include/El/blas_like/level3.hpp +--- a/include/El/blas_like/level3.hpp 2017-06-08 07:30:43.180249917 -0700 ++++ b/include/El/blas_like/level3.hpp 2017-06-08 07:35:27.325434602 -0700 +@@ -31,6 +31,10 @@ + } + using namespace GemmAlgorithmNS; + ++void GemmUseGPU(int min_M, int min_N, int min_K); ++ ++void GemmUseCPU(); ++ + template + void Gemm + ( Orientation orientA, Orientation orientB, +diff -Naur a/include/El/core/imports/blas.hpp b/include/El/core/imports/blas.hpp +--- a/include/El/core/imports/blas.hpp 2017-06-08 07:30:43.522016908 -0700 ++++ b/include/El/core/imports/blas.hpp 2017-06-08 07:35:06.834030908 -0700 +@@ -916,4 +916,63 @@ + } // namespace blas + } // namespace El + ++ ++#if defined(EL_USE_CUBLAS) ++ ++namespace El { ++ ++#ifdef EL_USE_64BIT_BLAS_INTS ++typedef long long int BlasInt; ++#else ++typedef int BlasInt; ++#endif ++ ++namespace cublas { ++ ++// NOTE: templated routines are custom and not wrappers ++ ++// Level 3 BLAS ++// ============ ++template ++void Gemm ++( char transA, char transB, BlasInt m, BlasInt n, BlasInt 
k, ++ const T& alpha, ++ const T* A, BlasInt ALDim, ++ const T* B, BlasInt BLDim, ++ const T& beta, ++ T* C, BlasInt CLDim ); ++ ++void Gemm ++( char transA, char transB, BlasInt m, BlasInt n, BlasInt k, ++ const float& alpha, ++ const float* A, BlasInt ALDim, ++ const float* B, BlasInt BLDim, ++ const float& beta, ++ float* C, BlasInt CLDim ); ++void Gemm ++( char transA, char transB, BlasInt m, BlasInt n, BlasInt k, ++ const double& alpha, ++ const double* A, BlasInt ALDim, ++ const double* B, BlasInt BLDim, ++ const double& beta, ++ double* C, BlasInt CLDim ); ++void Gemm ++( char transA, char transB, BlasInt m, BlasInt n, BlasInt k, ++ const scomplex& alpha, ++ const scomplex* A, BlasInt ALDim, ++ const scomplex* B, BlasInt BLDim, ++ const scomplex& beta, ++ scomplex* C, BlasInt CLDim ); ++void Gemm ++( char transA, char transB, BlasInt m, BlasInt n, BlasInt k, ++ const dcomplex& alpha, ++ const dcomplex* A, BlasInt ALDim, ++ const dcomplex* B, BlasInt BLDim, ++ const dcomplex& beta, ++ dcomplex* C, BlasInt CLDim ); ++ ++} // namespace cublas ++} // namespace El ++#endif ++ + #endif // ifndef EL_IMPORTS_BLAS_DECL_HPP +diff -Naur a/src/blas_like/level3/Gemm.cpp b/src/blas_like/level3/Gemm.cpp +--- a/src/blas_like/level3/Gemm.cpp 2017-06-08 07:30:44.307096427 -0700 ++++ b/src/blas_like/level3/Gemm.cpp 2017-06-08 07:34:23.062863489 -0700 +@@ -16,6 +16,20 @@ + + namespace El { + ++char gemm_cpu_gpu_switch = 'c'; ++int min_M = 0, min_N = 0, min_K = 0; ++ ++void GemmUseGPU(int _min_M, int _min_N, int _min_K) { ++ gemm_cpu_gpu_switch = 'g'; ++ min_M = _min_M; ++ min_N = _min_N; ++ min_K = _min_K; ++} ++ ++void GemmUseCPU() { ++ gemm_cpu_gpu_switch = 'c'; ++} ++ + template + void Gemm + ( Orientation orientA, Orientation orientB, +@@ -59,11 +73,30 @@ + const Int k = ( orientA == NORMAL ? 
A.Width() : A.Height() ); + if( k != 0 ) + { ++#if defined(EL_USE_CUBLAS) ++ if (gemm_cpu_gpu_switch == 'g' && ++ m >= min_M && ++ n >= min_N && ++ k >= min_K) { ++ cublas::Gemm ++ ( transA, transB, m, n, k, ++ alpha, A.LockedBuffer(), A.LDim(), ++ B.LockedBuffer(), B.LDim(), ++ beta, C.Buffer(), C.LDim() ); ++ } else { ++ blas::Gemm ++ ( transA, transB, m, n, k, ++ alpha, A.LockedBuffer(), A.LDim(), ++ B.LockedBuffer(), B.LDim(), ++ beta, C.Buffer(), C.LDim() ); ++ } ++#else + blas::Gemm + ( transA, transB, m, n, k, + alpha, A.LockedBuffer(), A.LDim(), + B.LockedBuffer(), B.LDim(), + beta, C.Buffer(), C.LDim() ); ++#endif + } + else + { +diff -Naur a/src/core/imports/blas/Gemm.hpp b/src/core/imports/blas/Gemm.hpp +--- a/src/core/imports/blas/Gemm.hpp 2017-06-08 07:30:45.090529967 -0700 ++++ b/src/core/imports/blas/Gemm.hpp 2017-06-08 07:34:46.503009958 -0700 +@@ -41,6 +41,12 @@ + + } // extern "C" + ++ ++#if defined(EL_USE_CUBLAS) ++#include ++#include ++#endif ++ + namespace El { + namespace blas { + +@@ -515,3 +521,515 @@ + + } // namespace blas + } // namespace El ++ ++ ++#if EL_USE_CUBLAS ++ ++#define USE_CUB 1 ++ ++namespace El { ++namespace cublas { ++ ++#if USE_CUB ++cub::CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory ++#endif ++ ++template ++void Gemm ++( char transA, char transB, ++ BlasInt m, BlasInt n, BlasInt k, ++ const T& alpha, ++ const T* A, BlasInt ALDim, ++ const T* B, BlasInt BLDim, ++ const T& beta, ++ T* C, BlasInt CLDim ) ++{ ++ // put something here ++ printf("integer version \n"); ++} ++template void Gemm ++( char transA, char transB, ++ BlasInt m, BlasInt n, BlasInt k, ++ const Int& alpha, ++ const Int* A, BlasInt ALDim, ++ const Int* B, BlasInt BLDim, ++ const Int& beta, ++ Int* C, BlasInt CLDim ); ++#ifdef EL_HAVE_QD ++template void Gemm ++( char transA, char transB, ++ BlasInt m, BlasInt n, BlasInt k, ++ const DoubleDouble& alpha, ++ const DoubleDouble* A, BlasInt ALDim, ++ const DoubleDouble* B, 
BlasInt BLDim, ++ const DoubleDouble& beta, ++ DoubleDouble* C, BlasInt CLDim ); ++template void Gemm ++( char transA, char transB, ++ BlasInt m, BlasInt n, BlasInt k, ++ const QuadDouble& alpha, ++ const QuadDouble* A, BlasInt ALDim, ++ const QuadDouble* B, BlasInt BLDim, ++ const QuadDouble& beta, ++ QuadDouble* C, BlasInt CLDim ); ++template void Gemm ++( char transA, char transB, ++ BlasInt m, BlasInt n, BlasInt k, ++ const Complex& alpha, ++ const Complex* A, BlasInt ALDim, ++ const Complex* B, BlasInt BLDim, ++ const Complex& beta, ++ Complex* C, BlasInt CLDim ); ++template void Gemm ++( char transA, char transB, ++ BlasInt m, BlasInt n, BlasInt k, ++ const Complex& alpha, ++ const Complex* A, BlasInt ALDim, ++ const Complex* B, BlasInt BLDim, ++ const Complex& beta, ++ Complex* C, BlasInt CLDim ); ++#endif ++#ifdef EL_HAVE_QUAD ++template void Gemm ++( char transA, char transB, ++ BlasInt m, BlasInt n, BlasInt k, ++ const Quad& alpha, ++ const Quad* A, BlasInt ALDim, ++ const Quad* B, BlasInt BLDim, ++ const Quad& beta, ++ Quad* C, BlasInt CLDim ); ++template void Gemm ++( char transA, char transB, ++ BlasInt m, BlasInt n, BlasInt k, ++ const Complex& alpha, ++ const Complex* A, BlasInt ALDim, ++ const Complex* B, BlasInt BLDim, ++ const Complex& beta, ++ Complex* C, BlasInt CLDim ); ++#endif ++#ifdef EL_HAVE_MPC ++template void Gemm ++( char transA, char transB, ++ BlasInt m, BlasInt n, BlasInt k, ++ const BigInt& alpha, ++ const BigInt* A, BlasInt ALDim, ++ const BigInt* B, BlasInt BLDim, ++ const BigInt& beta, ++ BigInt* C, BlasInt CLDim ); ++template void Gemm ++( char transA, char transB, ++ BlasInt m, BlasInt n, BlasInt k, ++ const BigFloat& alpha, ++ const BigFloat* A, BlasInt ALDim, ++ const BigFloat* B, BlasInt BLDim, ++ const BigFloat& beta, ++ BigFloat* C, BlasInt CLDim ); ++template void Gemm ++( char transA, char transB, ++ BlasInt m, BlasInt n, BlasInt k, ++ const Complex& alpha, ++ const Complex* A, BlasInt ALDim, ++ const Complex* B, BlasInt 
BLDim, ++ const Complex& beta, ++ Complex* C, BlasInt CLDim ); ++#endif ++ ++void Gemm ++( char transA, char transB, ++ BlasInt m, BlasInt n, BlasInt k, ++ const float& alpha, ++ const float* A, BlasInt ALDim, ++ const float* B, BlasInt BLDim, ++ const float& beta, ++ float* C, BlasInt CLDim ) ++{ ++ EL_DEBUG_CSE ++ EL_DEBUG_ONLY( ++ if( std::toupper(transA) == 'N' ) ++ { ++ if( ALDim < Max(m,1) ) ++ LogicError("ALDim was too small: ALDim=",ALDim,",m=",m); ++ } ++ else ++ { ++ if( ALDim < Max(k,1) ) ++ LogicError("ALDim was too small: ALDim=",ALDim,",k=",k); ++ } ++ ++ if( std::toupper(transB) == 'N' ) ++ { ++ if( BLDim < Max(k,1) ) ++ LogicError("BLDim was too small: BLDim=",BLDim,",k=",k); ++ } ++ else ++ { ++ if( BLDim < Max(n,1) ) ++ LogicError("BLDim was too small: BLDim=",BLDim,",n=",n); ++ } ++ ++ if( CLDim < Max(m,1) ) ++ LogicError("CLDim was too small: CLDim=",CLDim,",m=",m); ++ ) ++ const char fixedTransA = ( std::toupper(transA) == 'C' ? 'T' : transA ); ++ const char fixedTransB = ( std::toupper(transB) == 'C' ? 'T' : transB ); ++ ++ const mpi::Comm comm; ++ const Int commRank = mpi::Rank( comm ); ++ if (commRank == 0) { ++ //printf("calling cublas Sgemm: m %d n %d k %d\n", m, n, k); ++ } ++ ++ BlasInt rowA, colA, rowB, colB, rowC, colC; ++ // device memory size for A, B and C ++ BlasInt sizeA, sizeB, sizeC; ++ float *devA=NULL, *devB=NULL, *devC=NULL; ++ ++ rowA = fixedTransA == 'T' ? k : m; ++ colA = fixedTransA == 'T' ? m : k; ++ rowB = fixedTransB == 'T' ? n : k; ++ colB = fixedTransB == 'T' ? 
k : n; ++ rowC = m; ++ colC = n; ++ sizeA = rowA * colA; ++ sizeB = rowB * colB; ++ sizeC = rowC * colC; ++ ++ cublasStatus stat; ++ ++#if USE_CUB ++ CubDebugExit(g_allocator.DeviceAllocate((void**)&devA, ++ sizeof(float) * (sizeA+sizeB+sizeC) )); ++#else ++ stat = cublasAlloc(sizeA+sizeB+sizeC, sizeof(float), (void **) &devA); ++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("Alloc A,B,C error\n"); } ++#endif ++ ++ devB = devA + sizeA; ++ devC = devB + sizeB; ++ ++ // copy matrix A, B and C to device ++ stat = cublasSetMatrix(rowA, colA, sizeof(float), A, ALDim, devA, rowA); ++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix A error\n"); } ++ ++ stat = cublasSetMatrix(rowB, colB, sizeof(float), B, BLDim, devB, rowB); ++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix B error\n"); } ++ ++ if (beta != 0.0) ++ { ++ stat = cublasSetMatrix(rowC, colC, sizeof(float), C, CLDim, devC, rowC); ++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix C error\n"); } ++ } ++ ++ // cublasgemm ++ cublasSgemm ++ ( fixedTransA, fixedTransB, m, n, k, ++ alpha, devA, rowA, devB, rowB, beta, devC, rowC ); ++ ++ // copy matrix C to host ++ stat = cublasGetMatrix(rowC, colC, sizeof(float), devC, rowC, C, CLDim); ++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("GetMatrix C error\n"); } ++ ++ // free ++#if USE_CUB ++ CubDebugExit(g_allocator.DeviceFree(devA)); ++#else ++ cublasFree(devA); ++#endif ++ //printf("CUBLAS float done ...\n"); ++} ++ ++void Gemm ++( char transA, char transB, ++ BlasInt m, BlasInt n, BlasInt k, ++ const double& alpha, ++ const double* A, BlasInt ALDim, ++ const double* B, BlasInt BLDim, ++ const double& beta, ++ double* C, BlasInt CLDim ) ++{ ++ EL_DEBUG_CSE ++ EL_DEBUG_ONLY( ++ if( std::toupper(transA) == 'N' ) ++ { ++ if( ALDim < Max(m,1) ) ++ LogicError("ALDim was too small: ALDim=",ALDim,",m=",m); ++ } ++ else ++ { ++ if( ALDim < Max(k,1) ) ++ LogicError("ALDim was too small: ALDim=",ALDim,",k=",k); ++ } ++ ++ if( 
std::toupper(transB) == 'N' ) ++ { ++ if( BLDim < Max(k,1) ) ++ LogicError("BLDim was too small: BLDim=",BLDim,",k=",k); ++ } ++ else ++ { ++ if( BLDim < Max(n,1) ) ++ LogicError("BLDim was too small: BLDim=",BLDim,",n=",n); ++ } ++ ++ if( CLDim < Max(m,1) ) ++ LogicError("CLDim was too small: CLDim=",CLDim,",m=",m); ++ ) ++ const char fixedTransA = ( std::toupper(transA) == 'C' ? 'T' : transA ); ++ const char fixedTransB = ( std::toupper(transB) == 'C' ? 'T' : transB ); ++ ++ const mpi::Comm comm; ++ const Int commRank = mpi::Rank( comm ); ++ if (commRank == 0) { ++ //printf("calling cublas Dgemm: m %d n %d k %d\n", m, n, k); ++ } ++ ++ BlasInt rowA, colA, rowB, colB, rowC, colC; ++ // device memory size for A, B and C ++ BlasInt sizeA, sizeB, sizeC; ++ double *devA=NULL, *devB=NULL, *devC=NULL; ++ ++ rowA = fixedTransA == 'T' ? k : m; ++ colA = fixedTransA == 'T' ? m : k; ++ rowB = fixedTransB == 'T' ? n : k; ++ colB = fixedTransB == 'T' ? k : n; ++ rowC = m; ++ colC = n; ++ sizeA = rowA * colA; ++ sizeB = rowB * colB; ++ sizeC = rowC * colC; ++ ++ cublasStatus stat; ++ ++#if USE_CUB ++ CubDebugExit(g_allocator.DeviceAllocate((void**)&devA, ++ sizeof(double) * (sizeA+sizeB+sizeC) )); ++#else ++ stat = cublasAlloc(sizeA+sizeB+sizeC, sizeof(double), (void **) &devA); ++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("Alloc A,B,C error\n"); } ++#endif ++ ++ devB = devA + sizeA; ++ devC = devB + sizeB; ++ ++ // copy matrix A, B and C to device ++ stat = cublasSetMatrix(rowA, colA, sizeof(double), A, ALDim, devA, rowA); ++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix A error\n"); } ++ ++ stat = cublasSetMatrix(rowB, colB, sizeof(double), B, BLDim, devB, rowB); ++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix B error\n"); } ++ ++ if (beta != 0.0) ++ { ++ stat = cublasSetMatrix(rowC, colC, sizeof(double), C, CLDim, devC, rowC); ++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix C error\n"); } ++ } ++ ++ // cublasgemm ++ 
cublasDgemm ++ ( fixedTransA, fixedTransB, m, n, k, ++ alpha, devA, rowA, devB, rowB, beta, devC, rowC ); ++ ++ // copy matrix C to host ++ stat = cublasGetMatrix(rowC, colC, sizeof(double), devC, rowC, C, CLDim); ++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("GetMatrix C error\n"); } ++ ++ // free ++#if USE_CUB ++ CubDebugExit(g_allocator.DeviceFree(devA)); ++#else ++ cublasFree(devA); ++#endif ++ //printf("CUBLAS double done ...\n"); ++} ++ ++void Gemm ++( char transA, char transB, BlasInt m, BlasInt n, BlasInt k, ++ const scomplex& alpha, ++ const scomplex* A, BlasInt ALDim, ++ const scomplex* B, BlasInt BLDim, ++ const scomplex& beta, ++ scomplex* C, BlasInt CLDim ) ++{ ++ EL_DEBUG_CSE ++ EL_DEBUG_ONLY( ++ if( std::toupper(transA) == 'N' ) ++ { ++ if( ALDim < Max(m,1) ) ++ LogicError("ALDim was too small: ALDim=",ALDim,",m=",m); ++ } ++ else ++ { ++ if( ALDim < Max(k,1) ) ++ LogicError("ALDim was too small: ALDim=",ALDim,",k=",k); ++ } ++ ++ if( std::toupper(transB) == 'N' ) ++ { ++ if( BLDim < Max(k,1) ) ++ LogicError("BLDim was too small: BLDim=",BLDim,",k=",k); ++ } ++ else ++ { ++ if( BLDim < Max(n,1) ) ++ LogicError("BLDim was too small: BLDim=",BLDim,",n=",n); ++ } ++ ++ if( CLDim < Max(m,1) ) ++ LogicError("CLDim was too small: CLDim=",CLDim,",m=",m); ++ ) ++ ++ const char fixedTransA = transA; ++ const char fixedTransB = transB; ++ ++ const mpi::Comm comm; ++ const Int commRank = mpi::Rank( comm ); ++ if (commRank == 0) { ++ //printf("calling cublas Cgemm: m %d n %d k %d\n", m, n, k); ++ } ++ ++ BlasInt rowA, colA, rowB, colB, rowC, colC; ++ // device memory size for A, B and C ++ BlasInt sizeA, sizeB, sizeC; ++ cuComplex *devA=NULL, *devB=NULL, *devC=NULL; ++ ++ rowA = fixedTransA == 'T' ? k : m; ++ colA = fixedTransA == 'T' ? m : k; ++ rowB = fixedTransB == 'T' ? n : k; ++ colB = fixedTransB == 'T' ? 
k : n; ++ rowC = m; ++ colC = n; ++ sizeA = rowA * colA; ++ sizeB = rowB * colB; ++ sizeC = rowC * colC; ++ ++ cublasStatus stat; ++ stat = cublasAlloc(sizeA+sizeB+sizeC, sizeof(cuComplex), (void **) &devA); ++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("Alloc A,B,C error\n"); } ++ ++ devB = devA + sizeA; ++ devC = devB + sizeB; ++ ++ // copy matrix A, B and C to device ++ stat = cublasSetMatrix(rowA, colA, sizeof(cuComplex), A, ALDim, devA, rowA); ++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix A error\n"); } ++ ++ stat = cublasSetMatrix(rowB, colB, sizeof(cuComplex), B, BLDim, devB, rowB); ++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix B error\n"); } ++ ++ if (beta.real() != 0.0 || beta.imag() != 0.0) ++ { ++ stat = cublasSetMatrix(rowC, colC, sizeof(cuComplex), C, CLDim, devC, rowC); ++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix C error\n"); } ++ } ++ ++ // cublasgemm ++ cublasCgemm ++ ( fixedTransA, fixedTransB, m, n, k, ++ *((cuComplex*) &alpha), devA, rowA, devB, rowB, *((cuComplex*) &beta), devC, rowC ); ++ ++ // copy matrix C to host ++ stat = cublasGetMatrix(rowC, colC, sizeof(cuComplex), devC, rowC, C, CLDim); ++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("GetMatrix C error\n"); } ++ ++ // free ++ cublasFree(devA); ++} ++ ++void Gemm ++( char transA, char transB, BlasInt m, BlasInt n, BlasInt k, ++ const dcomplex& alpha, ++ const dcomplex* A, BlasInt ALDim, ++ const dcomplex* B, BlasInt BLDim, ++ const dcomplex& beta, ++ dcomplex* C, BlasInt CLDim ) ++{ ++ EL_DEBUG_CSE ++ EL_DEBUG_ONLY( ++ if( std::toupper(transA) == 'N' ) ++ { ++ if( ALDim < Max(m,1) ) ++ LogicError("ALDim was too small: ALDim=",ALDim,",m=",m); ++ } ++ else ++ { ++ if( ALDim < Max(k,1) ) ++ LogicError("ALDim was too small: ALDim=",ALDim,",k=",k); ++ } ++ ++ if( std::toupper(transB) == 'N' ) ++ { ++ if( BLDim < Max(k,1) ) ++ LogicError("BLDim was too small: BLDim=",BLDim,",k=",k); ++ } ++ else ++ { ++ if( BLDim < Max(n,1) ) 
++ LogicError("BLDim was too small: BLDim=",BLDim,",n=",n); ++ } ++ ++ if( CLDim < Max(m,1) ) ++ LogicError("CLDim was too small: CLDim=",CLDim,",m=",m); ++ ) ++ ++ const char fixedTransA = transA; ++ const char fixedTransB = transB; ++ ++ const mpi::Comm comm; ++ const Int commRank = mpi::Rank( comm ); ++ if (commRank == 0) { ++ //printf("calling cublas Zgemm: m %d n %d k %d\n", m, n, k); ++ } ++ ++ BlasInt rowA, colA, rowB, colB, rowC, colC; ++ // device memory size for A, B and C ++ BlasInt sizeA, sizeB, sizeC; ++ cuDoubleComplex *devA=NULL, *devB=NULL, *devC=NULL; ++ ++ rowA = fixedTransA == 'T' ? k : m; ++ colA = fixedTransA == 'T' ? m : k; ++ rowB = fixedTransB == 'T' ? n : k; ++ colB = fixedTransB == 'T' ? k : n; ++ rowC = m; ++ colC = n; ++ sizeA = rowA * colA; ++ sizeB = rowB * colB; ++ sizeC = rowC * colC; ++ ++ cublasStatus stat; ++ stat = cublasAlloc(sizeA+sizeB+sizeC, sizeof(cuDoubleComplex), (void **) &devA); ++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("Alloc A,B,C error\n"); } ++ ++ devB = devA + sizeA; ++ devC = devB + sizeB; ++ ++ // copy matrix A, B and C to device ++ stat = cublasSetMatrix(rowA, colA, sizeof(cuDoubleComplex), A, ALDim, devA, rowA); ++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix A error\n"); } ++ ++ stat = cublasSetMatrix(rowB, colB, sizeof(cuDoubleComplex), B, BLDim, devB, rowB); ++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix B error\n"); } ++ ++ if (beta.real() != 0.0 || beta.imag() != 0.0) ++ { ++ stat = cublasSetMatrix(rowC, colC, sizeof(cuDoubleComplex), C, CLDim, devC, rowC); ++ if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix C error\n"); } ++ } ++ ++ cublasZgemm ++ ( fixedTransA, fixedTransB, m, n, k, ++ *((cuDoubleComplex*) &alpha), devA, rowA, devB, rowB, *((cuDoubleComplex*) &beta), ++ devC, rowC ); ++ ++ // copy matrix C to host ++ stat = cublasGetMatrix(rowC, colC, sizeof(cuDoubleComplex), devC, rowC, C, CLDim); ++ if (stat != CUBLAS_STATUS_SUCCESS) { 
RuntimeError("GetMatrix C error\n"); } ++ ++ // free ++ cublasFree(devA); ++} ++ ++} // namespace cublas ++} // namespace El ++ ++#endif ++ diff --git a/var/spack/repos/builtin/packages/elemental/package.py b/var/spack/repos/builtin/packages/elemental/package.py index e118bcbd44..50fb4f9829 100644 --- a/var/spack/repos/builtin/packages/elemental/package.py +++ b/var/spack/repos/builtin/packages/elemental/package.py @@ -33,6 +33,7 @@ class Elemental(CMakePackage): homepage = "http://libelemental.org" url = "https://github.com/elemental/Elemental/archive/v0.87.6.tar.gz" + version('master', git='https://github.com/elemental/Elemental.git', branch='master') version('0.87.7', '6c1e7442021c59a36049e37ea69b8075') version('0.87.6', '9fd29783d45b0a0e27c0df85f548abe9') @@ -52,6 +53,8 @@ class Elemental(CMakePackage): description='Enable quad precision') variant('int64', default=False, description='Use 64bit integers') + variant('cublas', default=False, + description='Enable cuBLAS for local BLAS operations') # When this variant is set remove the normal dependencies since # Elemental has to build BLAS and ScaLAPACK internally variant('int64_blas', default=False, @@ -62,15 +65,21 @@ class Elemental(CMakePackage): variant('build_type', default='Release', description='The build type to build', values=('Debug', 'Release')) + variant('blas', default='openblas', values=('openblas', 'mkl'), + description='Enable the use of OpenBlas/MKL') - # Note that this forces us to use OpenBLAS until #1712 is fixed + # Note that #1712 forces us to enumerate the different blas variants depends_on('blas', when='~openmp_blas ~int64_blas') # Hack to forward variant to openblas package # Allow Elemental to build internally when using 8-byte ints - depends_on('openblas +openmp', when='+openmp_blas ~int64_blas') + depends_on('openblas +openmp', when='blas=openblas +openmp_blas ~int64_blas') + + depends_on('intel-mkl', when="blas=mkl ~openmp_blas ~int64_blas") + depends_on('intel-mkl +openmp', 
when='blas=mkl +openmp_blas ~int64_blas') + depends_on('intel-mkl@2017.1 +openmp +ilp64', when='blas=mkl +openmp_blas +int64_blas') # Note that this forces us to use OpenBLAS until #1712 is fixed - depends_on('lapack', when='~openmp_blas') + depends_on('lapack', when='blas=openblas ~openmp_blas') depends_on('metis') depends_on('metis +int64', when='+int64') depends_on('mpi') @@ -79,6 +88,8 @@ class Elemental(CMakePackage): extends('python', when='+python') depends_on('python@:2.8', when='+python') + patch('elemental_cublas.patch', when='+cublas') + @property def libs(self): shared = True if '+shared' in self.spec else False @@ -126,8 +137,7 @@ def cmake_args(self): math_libs = spec['scalapack'].libs + math_libs args.extend([ - '-DMATH_LIBS:STRING={0}'.format(math_libs.search_flags), - '-DMATH_LIBS:STRING={0}'.format(math_libs.link_flags)]) + '-DMATH_LIBS:STRING={0}'.format(math_libs.ld_flags)]) if '+python' in spec: args.extend([ diff --git a/var/spack/repos/builtin/packages/lbann/package.py b/var/spack/repos/builtin/packages/lbann/package.py index fea1924550..a93b9b5b66 100644 --- a/var/spack/repos/builtin/packages/lbann/package.py +++ b/var/spack/repos/builtin/packages/lbann/package.py @@ -39,37 +39,49 @@ class Lbann(CMakePackage): variant('gpu', default=False, description='Builds with support for GPUs via CUDA and cuDNN') variant('opencv', default=True, description='Builds with support for image processing routines with OpenCV') variant('seq_init', default=False, description='Force serial initialization of weight matrices.') + variant('dtype', default=4, description='Size (bits) of floating point representation for weights') + variant('build_type', default='Release', + description='The build type to build', + values=('Debug', 'Release')) depends_on('elemental +openmp_blas +scalapack +shared +int64') + depends_on('elemental +openmp_blas +scalapack +shared +int64 build_type=Debug', + when=('build_type=Debug')) depends_on('cuda', when='+gpu') depends_on('mpi') - 
depends_on('opencv@3.2.0', when='+opencv') + depends_on('opencv@3.2.0: +openmp +core +highgui +imgproc +jpeg +png +tiff +zlib', when='+opencv') depends_on('protobuf@3.0.2:') + depends_on('cnpy') def cmake_args(self): spec = self.spec # Environment variables CPPFLAGS = [] CPPFLAGS.append('-DLBANN_SET_EL_RNG') - if '~seq_init' in spec: - CPPFLAGS.append('-DLBANN_PARALLEL_RANDOM_MATRICES') + + CPPFLAGS.append('-DLBANN_DATATYPE={0}'.format( + int(spec.variants['dtype'].value))) args = [ '-DCMAKE_INSTALL_MESSAGE=LAZY', '-DCMAKE_CXX_FLAGS=%s' % ' '.join(CPPFLAGS), '-DWITH_CUDA:BOOL=%s' % ('+gpu' in spec), '-DWITH_CUDNN:BOOL=%s' % ('+gpu' in spec), + '-DELEMENTAL_USE_CUBLAS:BOOL=%s' % ( + '+cublas' in spec['elemental']), '-DWITH_TBINF=OFF', '-DWITH_VTUNE=OFF', - '-DElemental_DIR={0}'.format(self.spec['elemental'].prefix), + '-DElemental_DIR={0}'.format(spec['elemental'].prefix), + '-DCNPY_DIR={0}'.format(spec['cnpy'].prefix), '-DELEMENTAL_MATH_LIBS={0}'.format( - self.spec['elemental'].libs), + spec['elemental'].libs), + '-DSEQ_INIT:BOOL=%s' % ('+seq_init' in spec), '-DVERBOSE=0', '-DLBANN_HOME=.', '-DLBANN_VER=spack'] - if '+opencv' in self.spec: + if '+opencv' in spec: args.extend(['-DOpenCV_DIR:STRING={0}'.format( - self.spec['opencv'].prefix)]) + spec['opencv'].prefix)]) return args diff --git a/var/spack/repos/builtin/packages/libtiff/package.py b/var/spack/repos/builtin/packages/libtiff/package.py index 2fcccad739..29db7b42d3 100644 --- a/var/spack/repos/builtin/packages/libtiff/package.py +++ b/var/spack/repos/builtin/packages/libtiff/package.py @@ -35,6 +35,9 @@ class Libtiff(AutotoolsPackage): version('4.0.6', 'd1d2e940dea0b5ad435f21f03d96dd72') version('4.0.3', '051c1068e6a0627f461948c365290410') - depends_on('jpeg') + variant('turbo', default=False, description='use libjpeg-turbo') + + depends_on('jpeg', when='-turbo') + depends_on('libjpeg-turbo', when='+turbo') depends_on('zlib') depends_on('xz') diff --git a/var/spack/repos/builtin/packages/opencv/package.py 
b/var/spack/repos/builtin/packages/opencv/package.py index 33adadc15e..f2bda99a01 100644 --- a/var/spack/repos/builtin/packages/opencv/package.py +++ b/var/spack/repos/builtin/packages/opencv/package.py @@ -42,8 +42,15 @@ class Opencv(CMakePackage): homepage = 'http://opencv.org/' url = 'https://github.com/Itseez/opencv/archive/3.1.0.tar.gz' - version('3.2.0', 'a43b65488124ba33dde195fea9041b70') - version('3.1.0', '70e1dd07f0aa06606f1bc0e3fa15abd3') + version('master', git="https://github.com/opencv/opencv.git", branch="master") + version('3.2.0', 'a43b65488124ba33dde195fea9041b70') + version('3.1.0', '70e1dd07f0aa06606f1bc0e3fa15abd3') + version('2.4.13.2', 'fe52791ce523681a67036def4c25261b') + version('2.4.13.1', 'f6d354500d5013e60dc0fc44b07a63d1') + version('2.4.13', '8feb45a71adad89b8017a777477c3eff') + version('2.4.12.3', '2496a4a4caf8fecfbfc294fbe6a814b0') + version('2.4.12.2', 'bc0c60c2ea1cf4078deef99569912fc7') + version('2.4.12.1', '7192f51434710904b5e3594872b897c3') variant('shared', default=True, description='Enables the build of shared libraries') @@ -59,13 +66,21 @@ class Opencv(CMakePackage): description='Enables the build of Python extensions') variant('java', default=False, description='Activates support for Java') + variant('openmp', default=False, description='Activates support for OpenMP threads') + variant('core', default=False, description='Include opencv_core module into the OpenCV build') + variant('highgui', default=False, description='Include opencv_highgui module into the OpenCV build') + variant('imgproc', default=False, description='Include opencv_imgproc module into the OpenCV build') + variant('jpeg', default=False, description='Include JPEG support') + variant('png', default=False, description='Include PNG support') + variant('tiff', default=False, description='Include TIFF support') + variant('zlib', default=False, description='Build zlib from source') depends_on('eigen', when='+eigen', type='build') - depends_on('zlib') - 
depends_on('libpng') - depends_on('libjpeg-turbo') - depends_on('libtiff') + depends_on('zlib', when='+zlib') + depends_on('libpng', when='+png') + depends_on('libjpeg-turbo', when='+jpeg') + depends_on('libtiff+turbo', when='+tiff') depends_on('jasper', when='+jasper') depends_on('cuda', when='+cuda') @@ -94,6 +109,22 @@ def cmake_args(self): 'ON' if '+vtk' in spec else 'OFF')), '-DBUILD_opencv_java:BOOL={0}'.format(( 'ON' if '+java' in spec else 'OFF')), + '-DBUILD_opencv_core:BOOL={0}'.format(( + 'ON' if '+core' in spec else 'OFF')), + '-DBUILD_opencv_highgui:BOOL={0}'.format(( + 'ON' if '+highgui' in spec else 'OFF')), + '-DBUILD_opencv_imgproc:BOOL={0}'.format(( + 'ON' if '+imgproc' in spec else 'OFF')), + '-DWITH_JPEG:BOOL={0}'.format(( + 'ON' if '+jpeg' in spec else 'OFF')), + '-DWITH_PNG:BOOL={0}'.format(( + 'ON' if '+png' in spec else 'OFF')), + '-DWITH_TIFF:BOOL={0}'.format(( + 'ON' if '+tiff' in spec else 'OFF')), + '-DWITH_ZLIB:BOOL={0}'.format(( + 'ON' if '+zlib' in spec else 'OFF')), + '-DWITH_OPENMP:BOOL={0}'.format(( + 'ON' if '+openmp' in spec else 'OFF')), ] # Media I/O @@ -115,31 +146,35 @@ def cmake_args(self): '-DPNG_INCLUDE_DIR:PATH={0}'.format(libpng.prefix.include) ]) - libjpeg = spec['libjpeg-turbo'] - args.extend([ - '-DJPEG_LIBRARY:FILEPATH={0}'.format( - join_path(libjpeg.prefix.lib, - 'libjpeg.{0}'.format(dso_suffix))), - '-DJPEG_INCLUDE_DIR:PATH={0}'.format(libjpeg.prefix.include) - ]) + if '+jpeg' in spec: + libjpeg = spec['libjpeg-turbo'] + cmake_options.extend([ + '-DBUILD_JPEG:BOOL=OFF', + '-DJPEG_LIBRARY:FILEPATH={0}'.format( + join_path(libjpeg.prefix.lib, + 'libjpeg.{0}'.format(dso_suffix))), + '-DJPEG_INCLUDE_DIR:PATH={0}'.format(libjpeg.prefix.include) + ]) - libtiff = spec['libtiff'] - args.extend([ - '-DTIFF_LIBRARY_{0}:FILEPATH={1}'.format(( - 'DEBUG' if '+debug' in spec else 'RELEASE'), - join_path(libtiff.prefix.lib, - 'libtiff.{0}'.format(dso_suffix))), - '-DTIFF_INCLUDE_DIR:PATH={0}'.format(libtiff.prefix.include) - ]) 
+ if '+tiff' in spec: + libtiff = spec['libtiff'] + cmake_options.extend([ + '-DTIFF_LIBRARY_{0}:FILEPATH={1}'.format(( + 'DEBUG' if '+debug' in spec else 'RELEASE'), + join_path(libtiff.prefix.lib, + 'libtiff.{0}'.format(dso_suffix))), + '-DTIFF_INCLUDE_DIR:PATH={0}'.format(libtiff.prefix.include) + ]) - jasper = spec['jasper'] - args.extend([ - '-DJASPER_LIBRARY_{0}:FILEPATH={1}'.format(( - 'DEBUG' if '+debug' in spec else 'RELEASE'), - join_path(jasper.prefix.lib, - 'libjasper.{0}'.format(dso_suffix))), - '-DJASPER_INCLUDE_DIR:PATH={0}'.format(jasper.prefix.include) - ]) + if '+jasper' in spec: + jasper = spec['jasper'] + cmake_options.extend([ + '-DJASPER_LIBRARY_{0}:FILEPATH={1}'.format(( + 'DEBUG' if '+debug' in spec else 'RELEASE'), + join_path(jasper.prefix.lib, + 'libjasper.{0}'.format(dso_suffix))), + '-DJASPER_INCLUDE_DIR:PATH={0}'.format(jasper.prefix.include) + ]) # GUI if '+gtk' not in spec: