Elemental cublas (#4889)

* Added a package for the MDAnalysis toolkit.

* Added a patch that allows Elemental to use cuBLAS internally.

* Added support for LBANN to use the new cuBLAS extension in Elemental.

* Added a proper variant for when LBANN does not want to use cuBLAS in
elemental.

* Added a package for the cnpy project and used it in the lbann package.

* Removed unnecessary comment lines.

* Removed blank lines

* Removed debug variant

* Add support for libjpeg-turbo

* Added additional variants for OpenCV features. Fixed bug when linking
in TIFF support, where libtiff used the regular JPEG library and
OpenCV used libjpeg-turbo.  Now libtiff can use libjpeg-turbo.

* Removed the variant for getting Elemental to use the cublas variant.
Updated the requirements for OpenCV to add new options.

* Fixed a flake8 error in OpenCV and added a path to find cnpy in lbann.

* Fixed line too long flake8 error.

* Added a flag to specify the datatype size in lbann and fixed a flake8 error.

* Added a debug build variant using the new build_type

* Fixed flake8

* Fixed how the debug build is pushed to Elemental

* Fixed a bug in the Elemental package where the blas search flags were
being overridden by the blas link flags.  Changed how the sequential
initialization variant is implemented in LBANN.

* Added support via a variant to explicitly use mkl or openblas.  This
helps work around variant forwarding problems.

* Updated package files to address pull request comments.
This commit is contained in:
Brian Van Essen 2017-08-07 11:41:13 -07:00 committed by Adam J. Stewart
parent 755081968f
commit 8ca7c77008
6 changed files with 804 additions and 42 deletions

View file

@ -0,0 +1,34 @@
##############################################################################
# Copyright (c) 2013-2016, Lawrence Livermore National Security, LLC.
# Produced at the Lawrence Livermore National Laboratory.
#
# This file is part of Spack.
# Created by Todd Gamblin, tgamblin@llnl.gov, All rights reserved.
# LLNL-CODE-647188
#
# For details, see https://github.com/llnl/spack
# Please also see the NOTICE and LICENSE files for our notice and the LGPL.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License (as
# published by the Free Software Foundation) version 2.1, February 1999.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms and
# conditions of the GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
##############################################################################
from spack import *
class Cnpy(CMakePackage):
    """cnpy: library to read/write .npy and .npz files in C/C++."""

    # Project landing page; the same address is also given as the download URL.
    homepage = "https://github.com/rogersce/cnpy"
    url = "https://github.com/rogersce/cnpy"

    # Only a git checkout of the master branch is declared as a version.
    version(
        'master',
        git='https://github.com/rogersce/cnpy.git',
        branch="master",
    )

View file

@ -0,0 +1,668 @@
diff -Naur a/include/El/blas_like/level3.hpp b/include/El/blas_like/level3.hpp
--- a/include/El/blas_like/level3.hpp 2017-06-08 07:30:43.180249917 -0700
+++ b/include/El/blas_like/level3.hpp 2017-06-08 07:35:27.325434602 -0700
@@ -31,6 +31,10 @@
}
using namespace GemmAlgorithmNS;
+// Request that local Gemm calls be routed to cuBLAS whenever the problem
+// dimensions m, n and k are at least min_M, min_N and min_K respectively.
+// The GPU path is only compiled in when EL_USE_CUBLAS is defined.
+void GemmUseGPU(int min_M, int min_N, int min_K);
+
+// Route local Gemm calls back to the host BLAS (the initial setting).
+void GemmUseCPU();
+
template<typename T>
void Gemm
( Orientation orientA, Orientation orientB,
diff -Naur a/include/El/core/imports/blas.hpp b/include/El/core/imports/blas.hpp
--- a/include/El/core/imports/blas.hpp 2017-06-08 07:30:43.522016908 -0700
+++ b/include/El/core/imports/blas.hpp 2017-06-08 07:35:06.834030908 -0700
@@ -916,4 +916,63 @@
} // namespace blas
} // namespace El
+
+// Declarations of the cuBLAS-backed Gemm routines. They are only visible in
+// builds that define EL_USE_CUBLAS.
+#if defined(EL_USE_CUBLAS)
+
+namespace El {
+
+// Integer type used for matrix dimensions and leading dimensions below:
+// long long when EL_USE_64BIT_BLAS_INTS is defined, plain int otherwise.
+#ifdef EL_USE_64BIT_BLAS_INTS
+typedef long long int BlasInt;
+#else
+typedef int BlasInt;
+#endif
+
+namespace cublas {
+
+// NOTE: templated routines are custom and not wrappers
+
+// Level 3 BLAS
+// ============
+// Generic Gemm, selected for element types that have no dedicated overload
+// below. Takes the same argument list as the overloads.
+template<typename T>
+void Gemm
+( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
+ const T& alpha,
+ const T* A, BlasInt ALDim,
+ const T* B, BlasInt BLDim,
+ const T& beta,
+ T* C, BlasInt CLDim );
+
+// Overloads for the element types that map onto a cuBLAS kernel
+// (float, double, scomplex, dcomplex). A, B and C are host buffers with
+// leading dimensions ALDim, BLDim and CLDim; the result is written to C.
+void Gemm
+( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
+ const float& alpha,
+ const float* A, BlasInt ALDim,
+ const float* B, BlasInt BLDim,
+ const float& beta,
+ float* C, BlasInt CLDim );
+void Gemm
+( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
+ const double& alpha,
+ const double* A, BlasInt ALDim,
+ const double* B, BlasInt BLDim,
+ const double& beta,
+ double* C, BlasInt CLDim );
+void Gemm
+( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
+ const scomplex& alpha,
+ const scomplex* A, BlasInt ALDim,
+ const scomplex* B, BlasInt BLDim,
+ const scomplex& beta,
+ scomplex* C, BlasInt CLDim );
+void Gemm
+( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
+ const dcomplex& alpha,
+ const dcomplex* A, BlasInt ALDim,
+ const dcomplex* B, BlasInt BLDim,
+ const dcomplex& beta,
+ dcomplex* C, BlasInt CLDim );
+
+} // namespace cublas
+} // namespace El
+#endif
+
#endif // ifndef EL_IMPORTS_BLAS_DECL_HPP
diff -Naur a/src/blas_like/level3/Gemm.cpp b/src/blas_like/level3/Gemm.cpp
--- a/src/blas_like/level3/Gemm.cpp 2017-06-08 07:30:44.307096427 -0700
+++ b/src/blas_like/level3/Gemm.cpp 2017-06-08 07:34:23.062863489 -0700
@@ -16,6 +16,20 @@
namespace El {
+// Backend selector read by Gemm: 'c' = host BLAS (the default), 'g' = cuBLAS.
+char gemm_cpu_gpu_switch = 'c';
+// Minimum m, n and k a product must have before Gemm uses the GPU backend.
+int min_M = 0, min_N = 0, min_K = 0;
+
+// Select the GPU backend and record the minimum problem dimensions for it.
+// The parameters carry a leading underscore only to avoid shadowing the
+// globals of the same name.
+void GemmUseGPU(int _min_M, int _min_N, int _min_K) {
+ gemm_cpu_gpu_switch = 'g';
+ min_M = _min_M;
+ min_N = _min_N;
+ min_K = _min_K;
+}
+
+// Select the host BLAS backend. The stored thresholds are left unchanged.
+void GemmUseCPU() {
+ gemm_cpu_gpu_switch = 'c';
+}
+
template<typename T>
void Gemm
( Orientation orientA, Orientation orientB,
@@ -59,11 +73,30 @@
const Int k = ( orientA == NORMAL ? A.Width() : A.Height() );
if( k != 0 )
{
+#if defined(EL_USE_CUBLAS)
+ if (gemm_cpu_gpu_switch == 'g' &&
+ m >= min_M &&
+ n >= min_N &&
+ k >= min_K) {
+ cublas::Gemm
+ ( transA, transB, m, n, k,
+ alpha, A.LockedBuffer(), A.LDim(),
+ B.LockedBuffer(), B.LDim(),
+ beta, C.Buffer(), C.LDim() );
+ } else {
+ blas::Gemm
+ ( transA, transB, m, n, k,
+ alpha, A.LockedBuffer(), A.LDim(),
+ B.LockedBuffer(), B.LDim(),
+ beta, C.Buffer(), C.LDim() );
+ }
+#else
blas::Gemm
( transA, transB, m, n, k,
alpha, A.LockedBuffer(), A.LDim(),
B.LockedBuffer(), B.LDim(),
beta, C.Buffer(), C.LDim() );
+#endif
}
else
{
diff -Naur a/src/core/imports/blas/Gemm.hpp b/src/core/imports/blas/Gemm.hpp
--- a/src/core/imports/blas/Gemm.hpp 2017-06-08 07:30:45.090529967 -0700
+++ b/src/core/imports/blas/Gemm.hpp 2017-06-08 07:34:46.503009958 -0700
@@ -41,6 +41,12 @@
} // extern "C"
+
+#if defined(EL_USE_CUBLAS)
+#include <cublas.h>
+#include <cub/util_allocator.cuh>
+#endif
+
namespace El {
namespace blas {
@@ -515,3 +521,515 @@
} // namespace blas
} // namespace El
+
+
+#if EL_USE_CUBLAS
+
+#define USE_CUB 1
+
+namespace El {
+namespace cublas {
+
+#if USE_CUB
+cub::CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
+#endif
+
+// Generic Gemm for element types that have no cuBLAS kernel (the explicit
+// instantiations that follow: Int and, when enabled, the extended-precision
+// types).
+//
+// Computes C := alpha op(A) op(B) + beta C on the host by delegating to
+// blas::Gemm with the arguments unchanged. The previous placeholder body
+// only printed a message and returned with C untouched, which silently
+// produced a wrong result once the GPU backend had been selected.
+template<typename T>
+void Gemm
+( char transA, char transB,
+  BlasInt m, BlasInt n, BlasInt k,
+  const T& alpha,
+  const T* A, BlasInt ALDim,
+  const T* B, BlasInt BLDim,
+  const T& beta,
+        T* C, BlasInt CLDim )
+{
+    // Nothing can be offloaded for these types, so run the host routine.
+    blas::Gemm
+    ( transA, transB, m, n, k,
+      alpha, A, ALDim, B, BLDim, beta, C, CLDim );
+}
+// Explicit instantiations of the generic Gemm template.
+// Int is always instantiated; the remaining element types are instantiated
+// only when the corresponding EL_HAVE_* macro is defined.
+template void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const Int& alpha,
+ const Int* A, BlasInt ALDim,
+ const Int* B, BlasInt BLDim,
+ const Int& beta,
+ Int* C, BlasInt CLDim );
+// DoubleDouble / QuadDouble and their complex variants.
+#ifdef EL_HAVE_QD
+template void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const DoubleDouble& alpha,
+ const DoubleDouble* A, BlasInt ALDim,
+ const DoubleDouble* B, BlasInt BLDim,
+ const DoubleDouble& beta,
+ DoubleDouble* C, BlasInt CLDim );
+template void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const QuadDouble& alpha,
+ const QuadDouble* A, BlasInt ALDim,
+ const QuadDouble* B, BlasInt BLDim,
+ const QuadDouble& beta,
+ QuadDouble* C, BlasInt CLDim );
+template void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const Complex<DoubleDouble>& alpha,
+ const Complex<DoubleDouble>* A, BlasInt ALDim,
+ const Complex<DoubleDouble>* B, BlasInt BLDim,
+ const Complex<DoubleDouble>& beta,
+ Complex<DoubleDouble>* C, BlasInt CLDim );
+template void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const Complex<QuadDouble>& alpha,
+ const Complex<QuadDouble>* A, BlasInt ALDim,
+ const Complex<QuadDouble>* B, BlasInt BLDim,
+ const Complex<QuadDouble>& beta,
+ Complex<QuadDouble>* C, BlasInt CLDim );
+#endif
+// Quad and Complex<Quad>.
+#ifdef EL_HAVE_QUAD
+template void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const Quad& alpha,
+ const Quad* A, BlasInt ALDim,
+ const Quad* B, BlasInt BLDim,
+ const Quad& beta,
+ Quad* C, BlasInt CLDim );
+template void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const Complex<Quad>& alpha,
+ const Complex<Quad>* A, BlasInt ALDim,
+ const Complex<Quad>* B, BlasInt BLDim,
+ const Complex<Quad>& beta,
+ Complex<Quad>* C, BlasInt CLDim );
+#endif
+// BigInt, BigFloat and Complex<BigFloat>.
+#ifdef EL_HAVE_MPC
+template void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const BigInt& alpha,
+ const BigInt* A, BlasInt ALDim,
+ const BigInt* B, BlasInt BLDim,
+ const BigInt& beta,
+ BigInt* C, BlasInt CLDim );
+template void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const BigFloat& alpha,
+ const BigFloat* A, BlasInt ALDim,
+ const BigFloat* B, BlasInt BLDim,
+ const BigFloat& beta,
+ BigFloat* C, BlasInt CLDim );
+template void Gemm
+( char transA, char transB,
+ BlasInt m, BlasInt n, BlasInt k,
+ const Complex<BigFloat>& alpha,
+ const Complex<BigFloat>* A, BlasInt ALDim,
+ const Complex<BigFloat>* B, BlasInt BLDim,
+ const Complex<BigFloat>& beta,
+ Complex<BigFloat>* C, BlasInt CLDim );
+#endif
+
+// Private helpers shared by the four cuBLAS Gemm overloads below.
+// NOTE(review): the unnamed namespace assumes this file is included into a
+// single translation unit (it already defines non-inline functions and the
+// g_allocator object) -- confirm.
+namespace {
+
+// Owns one device allocation and releases it on scope exit, so that an error
+// raised part-way through a Gemm no longer leaks device memory.
+struct DeviceBlock
+{
+    void* ptr;    // device pointer, NULL while nothing is allocated
+    bool cached;  // true when ptr came from the CUB caching allocator
+
+    DeviceBlock() : ptr(NULL), cached(false) { }
+
+    ~DeviceBlock()
+    {
+        if( ptr == NULL )
+            return;
+#if USE_CUB
+        if( cached )
+        {
+            CubDebugExit(g_allocator.DeviceFree(ptr));
+            return;
+        }
+#endif
+        cublasFree(ptr);
+    }
+
+private:
+    // Not copyable: two owners would free the allocation twice.
+    DeviceBlock( const DeviceBlock& );
+    DeviceBlock& operator=( const DeviceBlock& );
+};
+
+// Upper-cases an orientation flag so 'n'/'t'/'c' behave like 'N'/'T'/'C'.
+inline char UpperOp( char op )
+{ return static_cast<char>( std::toupper( static_cast<unsigned char>(op) ) ); }
+
+// One LaunchGemm overload per element type; each forwards to the matching
+// legacy cuBLAS kernel. A, B and C are device pointers.
+inline void LaunchGemm
+( char opA, char opB, BlasInt m, BlasInt n, BlasInt k,
+  const float& alpha, const float* A, BlasInt lda,
+                      const float* B, BlasInt ldb,
+  const float& beta,        float* C, BlasInt ldc )
+{ cublasSgemm( opA, opB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc ); }
+
+inline void LaunchGemm
+( char opA, char opB, BlasInt m, BlasInt n, BlasInt k,
+  const double& alpha, const double* A, BlasInt lda,
+                       const double* B, BlasInt ldb,
+  const double& beta,        double* C, BlasInt ldc )
+{ cublasDgemm( opA, opB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc ); }
+
+// The complex scalars are reinterpreted as the cuBLAS complex structs, as the
+// original code did (this relies on the two types sharing a layout).
+inline void LaunchGemm
+( char opA, char opB, BlasInt m, BlasInt n, BlasInt k,
+  const scomplex& alpha, const cuComplex* A, BlasInt lda,
+                         const cuComplex* B, BlasInt ldb,
+  const scomplex& beta,        cuComplex* C, BlasInt ldc )
+{
+    cublasCgemm
+    ( opA, opB, m, n, k,
+      *reinterpret_cast<const cuComplex*>(&alpha), A, lda, B, ldb,
+      *reinterpret_cast<const cuComplex*>(&beta), C, ldc );
+}
+
+inline void LaunchGemm
+( char opA, char opB, BlasInt m, BlasInt n, BlasInt k,
+  const dcomplex& alpha, const cuDoubleComplex* A, BlasInt lda,
+                         const cuDoubleComplex* B, BlasInt ldb,
+  const dcomplex& beta,        cuDoubleComplex* C, BlasInt ldc )
+{
+    cublasZgemm
+    ( opA, opB, m, n, k,
+      *reinterpret_cast<const cuDoubleComplex*>(&alpha), A, lda, B, ldb,
+      *reinterpret_cast<const cuDoubleComplex*>(&beta), C, ldc );
+}
+
+// Stages A, B (and C when uploadC is set) on the device, runs the kernel for
+// C := alpha op(A) op(B) + beta C, and copies C back to the host.
+//
+//   opA, opB            upper-case orientation flags ('N', 'T' or 'C')
+//   DevT                device element type matching HostT
+//   uploadC             copy the host C to the device first (needed unless
+//                       beta is zero)
+//   useCachingAllocator take device memory from g_allocator when USE_CUB is
+//                       enabled; otherwise cublasAlloc is used
+//
+// Throws through RuntimeError when an allocation, transfer or the kernel
+// fails. Callers are expected to handle k == 0 themselves.
+template<typename DevT, typename HostT>
+void GemmOnDevice
+( char opA, char opB, BlasInt m, BlasInt n, BlasInt k,
+  const HostT& alpha,
+  const HostT* A, BlasInt ALDim,
+  const HostT* B, BlasInt BLDim,
+  const HostT& beta,
+        HostT* C, BlasInt CLDim,
+  bool uploadC, bool useCachingAllocator )
+{
+    EL_DEBUG_ONLY(
+      if( opA == 'N' )
+      {
+          if( ALDim < Max(m,1) )
+              LogicError("ALDim was too small: ALDim=",ALDim,",m=",m);
+      }
+      else
+      {
+          if( ALDim < Max(k,1) )
+              LogicError("ALDim was too small: ALDim=",ALDim,",k=",k);
+      }
+
+      if( opB == 'N' )
+      {
+          if( BLDim < Max(k,1) )
+              LogicError("BLDim was too small: BLDim=",BLDim,",k=",k);
+      }
+      else
+      {
+          if( BLDim < Max(n,1) )
+              LogicError("BLDim was too small: BLDim=",BLDim,",n=",n);
+      }
+
+      if( CLDim < Max(m,1) )
+          LogicError("CLDim was too small: CLDim=",CLDim,",m=",m);
+    )
+    (void)useCachingAllocator; // unused when USE_CUB is disabled
+
+    // An empty C leaves nothing to compute, and the cuBLAS transfer helpers
+    // reject the zero leading dimension it would produce.
+    if( m == 0 || n == 0 )
+        return;
+
+    // Stored shape of each operand. Anything other than 'N' (that is, both
+    // 'T' and 'C') means the operand is stored as its transpose.
+    const BlasInt rowA = ( opA == 'N' ? m : k );
+    const BlasInt colA = ( opA == 'N' ? k : m );
+    const BlasInt rowB = ( opB == 'N' ? k : n );
+    const BlasInt colB = ( opB == 'N' ? n : k );
+    const BlasInt rowC = m;
+    const BlasInt colC = n;
+
+    // Element counts are formed in size_t so the products cannot overflow
+    // BlasInt.
+    const size_t sizeA = static_cast<size_t>(rowA) * static_cast<size_t>(colA);
+    const size_t sizeB = static_cast<size_t>(rowB) * static_cast<size_t>(colB);
+    const size_t sizeC = static_cast<size_t>(rowC) * static_cast<size_t>(colC);
+    const size_t total = sizeA + sizeB + sizeC;
+
+    cublasStatus stat;
+
+    // A, B and C share one device allocation: [ A | B | C ].
+    DeviceBlock block;
+    bool allocated = false;
+#if USE_CUB
+    if( useCachingAllocator )
+    {
+        block.cached = true;
+        CubDebugExit(g_allocator.DeviceAllocate(&block.ptr,
+                                                sizeof(DevT) * total));
+        allocated = true;
+    }
+#endif
+    if( !allocated )
+    {
+        // cublasAlloc counts elements in an int; refuse sizes that do not fit.
+        const int count = static_cast<int>(total);
+        if( count < 0 || static_cast<size_t>(count) != total )
+            RuntimeError("Alloc A,B,C error\n");
+        stat = cublasAlloc(count, sizeof(DevT), &block.ptr);
+        if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("Alloc A,B,C error\n"); }
+    }
+
+    DevT* devA = static_cast<DevT*>(block.ptr);
+    DevT* devB = devA + sizeA;
+    DevT* devC = devB + sizeB;
+
+    // copy matrix A, B and C to device (packed, so ld == stored row count)
+    stat = cublasSetMatrix(rowA, colA, sizeof(DevT), A, ALDim, devA, rowA);
+    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix A error\n"); }
+
+    stat = cublasSetMatrix(rowB, colB, sizeof(DevT), B, BLDim, devB, rowB);
+    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix B error\n"); }
+
+    if( uploadC )
+    {
+        stat = cublasSetMatrix(rowC, colC, sizeof(DevT), C, CLDim, devC, rowC);
+        if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix C error\n"); }
+    }
+
+    // cublas<t>gemm. The legacy kernels return void, so clear any stale
+    // status first and query it again afterwards.
+    (void)cublasGetError();
+    LaunchGemm
+    ( opA, opB, m, n, k,
+      alpha, devA, rowA, devB, rowB, beta, devC, rowC );
+    stat = cublasGetError();
+    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("Gemm kernel error\n"); }
+
+    // copy matrix C to host
+    stat = cublasGetMatrix(rowC, colC, sizeof(DevT), devC, rowC, C, CLDim);
+    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("GetMatrix C error\n"); }
+
+    // The device memory is released by block's destructor.
+}
+
+} // anonymous namespace
+
+// Single-precision real Gemm on the GPU:
+// C := alpha op(A) op(B) + beta C, with A, B and C in host memory.
+// Conjugation is a no-op for real data, so 'C' is executed as 'T'.
+void Gemm
+( char transA, char transB,
+  BlasInt m, BlasInt n, BlasInt k,
+  const float& alpha,
+  const float* A, BlasInt ALDim,
+  const float* B, BlasInt BLDim,
+  const float& beta,
+        float* C, BlasInt CLDim )
+{
+    EL_DEBUG_CSE
+    const char upperA = UpperOp(transA);
+    const char upperB = UpperOp(transB);
+    GemmOnDevice<float>
+    ( ( upperA == 'C' ? 'T' : upperA ), ( upperB == 'C' ? 'T' : upperB ),
+      m, n, k, alpha, A, ALDim, B, BLDim, beta, C, CLDim,
+      beta != 0.0, true );
+}
+
+// Double-precision real Gemm on the GPU; same contract as the float overload.
+void Gemm
+( char transA, char transB,
+  BlasInt m, BlasInt n, BlasInt k,
+  const double& alpha,
+  const double* A, BlasInt ALDim,
+  const double* B, BlasInt BLDim,
+  const double& beta,
+        double* C, BlasInt CLDim )
+{
+    EL_DEBUG_CSE
+    const char upperA = UpperOp(transA);
+    const char upperB = UpperOp(transB);
+    GemmOnDevice<double>
+    ( ( upperA == 'C' ? 'T' : upperA ), ( upperB == 'C' ? 'T' : upperB ),
+      m, n, k, alpha, A, ALDim, B, BLDim, beta, C, CLDim,
+      beta != 0.0, true );
+}
+
+// Single-precision complex Gemm on the GPU. 'C' (adjoint) is passed through
+// to cuBLAS and sized like a transpose. Device memory comes from cublasAlloc,
+// as before.
+void Gemm
+( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
+  const scomplex& alpha,
+  const scomplex* A, BlasInt ALDim,
+  const scomplex* B, BlasInt BLDim,
+  const scomplex& beta,
+        scomplex* C, BlasInt CLDim )
+{
+    EL_DEBUG_CSE
+    GemmOnDevice<cuComplex>
+    ( UpperOp(transA), UpperOp(transB),
+      m, n, k, alpha, A, ALDim, B, BLDim, beta, C, CLDim,
+      beta.real() != 0.0 || beta.imag() != 0.0, false );
+}
+
+// Double-precision complex Gemm on the GPU; same contract as the scomplex
+// overload.
+void Gemm
+( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
+  const dcomplex& alpha,
+  const dcomplex* A, BlasInt ALDim,
+  const dcomplex* B, BlasInt BLDim,
+  const dcomplex& beta,
+        dcomplex* C, BlasInt CLDim )
+{
+    EL_DEBUG_CSE
+    GemmOnDevice<cuDoubleComplex>
+    ( UpperOp(transA), UpperOp(transB),
+      m, n, k, alpha, A, ALDim, B, BLDim, beta, C, CLDim,
+      beta.real() != 0.0 || beta.imag() != 0.0, false );
+}
+
+} // namespace cublas
+} // namespace El
+
+#endif
+

View file

@ -33,6 +33,7 @@ class Elemental(CMakePackage):
homepage = "http://libelemental.org" homepage = "http://libelemental.org"
url = "https://github.com/elemental/Elemental/archive/v0.87.6.tar.gz" url = "https://github.com/elemental/Elemental/archive/v0.87.6.tar.gz"
version('master', git='https://github.com/elemental/Elemental.git', branch='master')
version('0.87.7', '6c1e7442021c59a36049e37ea69b8075') version('0.87.7', '6c1e7442021c59a36049e37ea69b8075')
version('0.87.6', '9fd29783d45b0a0e27c0df85f548abe9') version('0.87.6', '9fd29783d45b0a0e27c0df85f548abe9')
@ -52,6 +53,8 @@ class Elemental(CMakePackage):
description='Enable quad precision') description='Enable quad precision')
variant('int64', default=False, variant('int64', default=False,
description='Use 64bit integers') description='Use 64bit integers')
variant('cublas', default=False,
description='Enable cuBLAS for local BLAS operations')
# When this variant is set remove the normal dependencies since # When this variant is set remove the normal dependencies since
# Elemental has to build BLAS and ScaLAPACK internally # Elemental has to build BLAS and ScaLAPACK internally
variant('int64_blas', default=False, variant('int64_blas', default=False,
@ -62,15 +65,21 @@ class Elemental(CMakePackage):
variant('build_type', default='Release', variant('build_type', default='Release',
description='The build type to build', description='The build type to build',
values=('Debug', 'Release')) values=('Debug', 'Release'))
variant('blas', default='openblas', values=('openblas', 'mkl'),
description='Enable the use of OpenBlas/MKL')
# Note that this forces us to use OpenBLAS until #1712 is fixed # Note that #1712 forces us to enumerate the different blas variants
depends_on('blas', when='~openmp_blas ~int64_blas') depends_on('blas', when='~openmp_blas ~int64_blas')
# Hack to forward variant to openblas package # Hack to forward variant to openblas package
# Allow Elemental to build internally when using 8-byte ints # Allow Elemental to build internally when using 8-byte ints
depends_on('openblas +openmp', when='+openmp_blas ~int64_blas') depends_on('openblas +openmp', when='blas=openblas +openmp_blas ~int64_blas')
depends_on('intel-mkl', when="blas=mkl ~openmp_blas ~int64_blas")
depends_on('intel-mkl +openmp', when='blas=mkl +openmp_blas ~int64_blas')
depends_on('intel-mkl@2017.1 +openmp +ilp64', when='blas=mkl +openmp_blas +int64_blas')
# Note that this forces us to use OpenBLAS until #1712 is fixed # Note that this forces us to use OpenBLAS until #1712 is fixed
depends_on('lapack', when='~openmp_blas') depends_on('lapack', when='blas=openblas ~openmp_blas')
depends_on('metis') depends_on('metis')
depends_on('metis +int64', when='+int64') depends_on('metis +int64', when='+int64')
depends_on('mpi') depends_on('mpi')
@ -79,6 +88,8 @@ class Elemental(CMakePackage):
extends('python', when='+python') extends('python', when='+python')
depends_on('python@:2.8', when='+python') depends_on('python@:2.8', when='+python')
patch('elemental_cublas.patch', when='+cublas')
@property @property
def libs(self): def libs(self):
shared = True if '+shared' in self.spec else False shared = True if '+shared' in self.spec else False
@ -126,8 +137,7 @@ def cmake_args(self):
math_libs = spec['scalapack'].libs + math_libs math_libs = spec['scalapack'].libs + math_libs
args.extend([ args.extend([
'-DMATH_LIBS:STRING={0}'.format(math_libs.search_flags), '-DMATH_LIBS:STRING={0}'.format(math_libs.ld_flags)])
'-DMATH_LIBS:STRING={0}'.format(math_libs.link_flags)])
if '+python' in spec: if '+python' in spec:
args.extend([ args.extend([

View file

@ -39,37 +39,49 @@ class Lbann(CMakePackage):
variant('gpu', default=False, description='Builds with support for GPUs via CUDA and cuDNN') variant('gpu', default=False, description='Builds with support for GPUs via CUDA and cuDNN')
variant('opencv', default=True, description='Builds with support for image processing routines with OpenCV') variant('opencv', default=True, description='Builds with support for image processing routines with OpenCV')
variant('seq_init', default=False, description='Force serial initialization of weight matrices.') variant('seq_init', default=False, description='Force serial initialization of weight matrices.')
variant('dtype', default=4, description='Size (bits) of floating point representation for weights')
variant('build_type', default='Release',
description='The build type to build',
values=('Debug', 'Release'))
depends_on('elemental +openmp_blas +scalapack +shared +int64') depends_on('elemental +openmp_blas +scalapack +shared +int64')
depends_on('elemental +openmp_blas +scalapack +shared +int64 build_type=Debug',
when=('build_type=Debug'))
depends_on('cuda', when='+gpu') depends_on('cuda', when='+gpu')
depends_on('mpi') depends_on('mpi')
depends_on('opencv@3.2.0', when='+opencv') depends_on('opencv@3.2.0: +openmp +core +highgui +imgproc +jpeg +png +tiff +zlib', when='+opencv')
depends_on('protobuf@3.0.2:') depends_on('protobuf@3.0.2:')
depends_on('cnpy')
def cmake_args(self): def cmake_args(self):
spec = self.spec spec = self.spec
# Environment variables # Environment variables
CPPFLAGS = [] CPPFLAGS = []
CPPFLAGS.append('-DLBANN_SET_EL_RNG') CPPFLAGS.append('-DLBANN_SET_EL_RNG')
if '~seq_init' in spec:
CPPFLAGS.append('-DLBANN_PARALLEL_RANDOM_MATRICES') CPPFLAGS.append('-DLBANN_DATATYPE={0}'.format(
int(spec.variants['dtype'].value)))
args = [ args = [
'-DCMAKE_INSTALL_MESSAGE=LAZY', '-DCMAKE_INSTALL_MESSAGE=LAZY',
'-DCMAKE_CXX_FLAGS=%s' % ' '.join(CPPFLAGS), '-DCMAKE_CXX_FLAGS=%s' % ' '.join(CPPFLAGS),
'-DWITH_CUDA:BOOL=%s' % ('+gpu' in spec), '-DWITH_CUDA:BOOL=%s' % ('+gpu' in spec),
'-DWITH_CUDNN:BOOL=%s' % ('+gpu' in spec), '-DWITH_CUDNN:BOOL=%s' % ('+gpu' in spec),
'-DELEMENTAL_USE_CUBLAS:BOOL=%s' % (
'+cublas' in spec['elemental']),
'-DWITH_TBINF=OFF', '-DWITH_TBINF=OFF',
'-DWITH_VTUNE=OFF', '-DWITH_VTUNE=OFF',
'-DElemental_DIR={0}'.format(self.spec['elemental'].prefix), '-DElemental_DIR={0}'.format(spec['elemental'].prefix),
'-DCNPY_DIR={0}'.format(spec['cnpy'].prefix),
'-DELEMENTAL_MATH_LIBS={0}'.format( '-DELEMENTAL_MATH_LIBS={0}'.format(
self.spec['elemental'].libs), spec['elemental'].libs),
'-DSEQ_INIT:BOOL=%s' % ('+seq_init' in spec),
'-DVERBOSE=0', '-DVERBOSE=0',
'-DLBANN_HOME=.', '-DLBANN_HOME=.',
'-DLBANN_VER=spack'] '-DLBANN_VER=spack']
if '+opencv' in self.spec: if '+opencv' in spec:
args.extend(['-DOpenCV_DIR:STRING={0}'.format( args.extend(['-DOpenCV_DIR:STRING={0}'.format(
self.spec['opencv'].prefix)]) spec['opencv'].prefix)])
return args return args

View file

@ -35,6 +35,9 @@ class Libtiff(AutotoolsPackage):
version('4.0.6', 'd1d2e940dea0b5ad435f21f03d96dd72') version('4.0.6', 'd1d2e940dea0b5ad435f21f03d96dd72')
version('4.0.3', '051c1068e6a0627f461948c365290410') version('4.0.3', '051c1068e6a0627f461948c365290410')
depends_on('jpeg') variant('turbo', default=False, description='use libjpeg-turbo')
depends_on('jpeg', when='-turbo')
depends_on('libjpeg-turbo', when='+turbo')
depends_on('zlib') depends_on('zlib')
depends_on('xz') depends_on('xz')

View file

@ -42,8 +42,15 @@ class Opencv(CMakePackage):
homepage = 'http://opencv.org/' homepage = 'http://opencv.org/'
url = 'https://github.com/Itseez/opencv/archive/3.1.0.tar.gz' url = 'https://github.com/Itseez/opencv/archive/3.1.0.tar.gz'
version('master', git="https://github.com/opencv/opencv.git", branch="master")
version('3.2.0', 'a43b65488124ba33dde195fea9041b70') version('3.2.0', 'a43b65488124ba33dde195fea9041b70')
version('3.1.0', '70e1dd07f0aa06606f1bc0e3fa15abd3') version('3.1.0', '70e1dd07f0aa06606f1bc0e3fa15abd3')
version('2.4.13.2', 'fe52791ce523681a67036def4c25261b')
version('2.4.13.1', 'f6d354500d5013e60dc0fc44b07a63d1')
version('2.4.13', '8feb45a71adad89b8017a777477c3eff')
version('2.4.12.3', '2496a4a4caf8fecfbfc294fbe6a814b0')
version('2.4.12.2', 'bc0c60c2ea1cf4078deef99569912fc7')
version('2.4.12.1', '7192f51434710904b5e3594872b897c3')
variant('shared', default=True, variant('shared', default=True,
description='Enables the build of shared libraries') description='Enables the build of shared libraries')
@ -59,13 +66,21 @@ class Opencv(CMakePackage):
description='Enables the build of Python extensions') description='Enables the build of Python extensions')
variant('java', default=False, variant('java', default=False,
description='Activates support for Java') description='Activates support for Java')
variant('openmp', default=False, description='Activates support for OpenMP threads')
variant('core', default=False, description='Include opencv_core module into the OpenCV build')
variant('highgui', default=False, description='Include opencv_highgui module into the OpenCV build')
variant('imgproc', default=False, description='Include opencv_imgproc module into the OpenCV build')
variant('jpeg', default=False, description='Include JPEG support')
variant('png', default=False, description='Include PNG support')
variant('tiff', default=False, description='Include TIFF support')
variant('zlib', default=False, description='Build zlib from source')
depends_on('eigen', when='+eigen', type='build') depends_on('eigen', when='+eigen', type='build')
depends_on('zlib') depends_on('zlib', when='+zlib')
depends_on('libpng') depends_on('libpng', when='+png')
depends_on('libjpeg-turbo') depends_on('libjpeg-turbo', when='+jpeg')
depends_on('libtiff') depends_on('libtiff+turbo', when='+tiff')
depends_on('jasper', when='+jasper') depends_on('jasper', when='+jasper')
depends_on('cuda', when='+cuda') depends_on('cuda', when='+cuda')
@ -94,6 +109,22 @@ def cmake_args(self):
'ON' if '+vtk' in spec else 'OFF')), 'ON' if '+vtk' in spec else 'OFF')),
'-DBUILD_opencv_java:BOOL={0}'.format(( '-DBUILD_opencv_java:BOOL={0}'.format((
'ON' if '+java' in spec else 'OFF')), 'ON' if '+java' in spec else 'OFF')),
'-DBUILD_opencv_core:BOOL={0}'.format((
'ON' if '+core' in spec else 'OFF')),
'-DBUILD_opencv_highgui:BOOL={0}'.format((
'ON' if '+highgui' in spec else 'OFF')),
'-DBUILD_opencv_imgproc:BOOL={0}'.format((
'ON' if '+imgproc' in spec else 'OFF')),
'-DWITH_JPEG:BOOL={0}'.format((
'ON' if '+jpeg' in spec else 'OFF')),
'-DWITH_PNG:BOOL={0}'.format((
'ON' if '+png' in spec else 'OFF')),
'-DWITH_TIFF:BOOL={0}'.format((
'ON' if '+tiff' in spec else 'OFF')),
'-DWITH_ZLIB:BOOL={0}'.format((
'ON' if '+zlib' in spec else 'OFF')),
'-DWITH_OPENMP:BOOL={0}'.format((
'ON' if '+openmp' in spec else 'OFF')),
] ]
# Media I/O # Media I/O
@ -115,16 +146,19 @@ def cmake_args(self):
'-DPNG_INCLUDE_DIR:PATH={0}'.format(libpng.prefix.include) '-DPNG_INCLUDE_DIR:PATH={0}'.format(libpng.prefix.include)
]) ])
if '+jpeg' in spec:
libjpeg = spec['libjpeg-turbo'] libjpeg = spec['libjpeg-turbo']
args.extend([ cmake_options.extend([
'-DBUILD_JPEG:BOOL=OFF',
'-DJPEG_LIBRARY:FILEPATH={0}'.format( '-DJPEG_LIBRARY:FILEPATH={0}'.format(
join_path(libjpeg.prefix.lib, join_path(libjpeg.prefix.lib,
'libjpeg.{0}'.format(dso_suffix))), 'libjpeg.{0}'.format(dso_suffix))),
'-DJPEG_INCLUDE_DIR:PATH={0}'.format(libjpeg.prefix.include) '-DJPEG_INCLUDE_DIR:PATH={0}'.format(libjpeg.prefix.include)
]) ])
if '+tiff' in spec:
libtiff = spec['libtiff'] libtiff = spec['libtiff']
args.extend([ cmake_options.extend([
'-DTIFF_LIBRARY_{0}:FILEPATH={1}'.format(( '-DTIFF_LIBRARY_{0}:FILEPATH={1}'.format((
'DEBUG' if '+debug' in spec else 'RELEASE'), 'DEBUG' if '+debug' in spec else 'RELEASE'),
join_path(libtiff.prefix.lib, join_path(libtiff.prefix.lib,
@ -132,8 +166,9 @@ def cmake_args(self):
'-DTIFF_INCLUDE_DIR:PATH={0}'.format(libtiff.prefix.include) '-DTIFF_INCLUDE_DIR:PATH={0}'.format(libtiff.prefix.include)
]) ])
if '+jasper' in spec:
jasper = spec['jasper'] jasper = spec['jasper']
args.extend([ cmake_options.extend([
'-DJASPER_LIBRARY_{0}:FILEPATH={1}'.format(( '-DJASPER_LIBRARY_{0}:FILEPATH={1}'.format((
'DEBUG' if '+debug' in spec else 'RELEASE'), 'DEBUG' if '+debug' in spec else 'RELEASE'),
join_path(jasper.prefix.lib, join_path(jasper.prefix.lib,