Elemental cublas (#4889)

* Added a package for the MDAnalysis toolkit. * Added a patch that allows Elemental to use cuBLAS internally. * Added support for LBANN to use the new cuBLAS extension in Elemental. * Added a proper variant for when LBANN does not want to use cuBLAS in elemental. * Added a package for the cnpy project and used it in the lbann package. * Removed unnecessary comment lines. * Removed blank lines * Removed debug variant * Add support for libjpeg-turbo * Added additional variants for OpenCV features. Fixed bug when linking in TIFF support, where libtiff used the regular JPEG library and OpenCV used libjpeg-turbo. Now libtiff can use libjpeg-turbo. * Removed the variant for getting Elemental to use the cublas variant. Updated the requirements for OpenCV to add new options. * Fixed a flake8 error in OpenCV and added a path to find cnpy in lbann. * Fixed line too long flake8 error. * Added a flag to specify the datatype size in lbann and fixed a flake8 error. * Added a debug build variant using the new build_type * Fixed flake8 * Fixed how the debug build is pushed to Elemental * Fixed a bug in the Elemental package where the blas search flags were being overridden by the blas link flags. Changed how the sequential initialization variant is implemented in LBANN. * Added support via a variant to explicitly use mkl or openblas. This helps work around variant forwarding problems. * Updated package files to address pull request comments.
2017-08-07 11:41:13 -07:00 · 2017-08-07 11:41:13 -07:00 · 8ca7c77008
commit 8ca7c77008
parent 755081968f
6 changed files with 804 additions and 42 deletions
--- a/var/spack/repos/builtin/packages/cnpy/package.py
+++ b/var/spack/repos/builtin/packages/cnpy/package.py
@ -0,0 +1,34 @@
 ##############################################################################
 # Copyright (c) 2013-2016, Lawrence Livermore National Security, LLC.
 # Produced at the Lawrence Livermore National Laboratory.
 #
 # This file is part of Spack.
 # Created by Todd Gamblin, tgamblin@llnl.gov, All rights reserved.
 # LLNL-CODE-647188
 #
 # For details, see https://github.com/llnl/spack
 # Please also see the NOTICE and LICENSE files for our notice and the LGPL.
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License (as
 # published by the Free Software Foundation) version 2.1, February 1999.
 #
 # This program is distributed in the hope that it will be useful, but
 # WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms and
 # conditions of the GNU Lesser General Public License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public
 # License along with this program; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ##############################################################################
 from spack import *
 class Cnpy(CMakePackage):
    """cnpy: library to read/write .npy and .npz files in C/C++."""
    homepage = "https://github.com/rogersce/cnpy"
    url      = "https://github.com/rogersce/cnpy"
    version('master', git='https://github.com/rogersce/cnpy.git', branch="master")
--- a/var/spack/repos/builtin/packages/elemental/elemental_cublas.patch
+++ b/var/spack/repos/builtin/packages/elemental/elemental_cublas.patch
@ -0,0 +1,668 @@
 diff -Naur a/include/El/blas_like/level3.hpp b/include/El/blas_like/level3.hpp
 --- a/include/El/blas_like/level3.hpp	2017-06-08 07:30:43.180249917 -0700
 +++ b/include/El/blas_like/level3.hpp	2017-06-08 07:35:27.325434602 -0700
@@ -31,6 +31,10 @@
 }
 using namespace GemmAlgorithmNS;
 +void GemmUseGPU(int min_M, int min_N, int min_K);
 +
 +void GemmUseCPU();
 +
 template<typename T>
 void Gemm
 ( Orientation orientA, Orientation orientB,
 diff -Naur a/include/El/core/imports/blas.hpp b/include/El/core/imports/blas.hpp
 --- a/include/El/core/imports/blas.hpp	2017-06-08 07:30:43.522016908 -0700
 +++ b/include/El/core/imports/blas.hpp	2017-06-08 07:35:06.834030908 -0700
@@ -916,4 +916,63 @@
 } // namespace blas
 } // namespace El
 +
 +#if defined(EL_USE_CUBLAS)
 +
 +namespace El {
 +
 +#ifdef EL_USE_64BIT_BLAS_INTS
 +typedef long long int BlasInt;
 +#else
 +typedef int BlasInt;
 +#endif
 +
 +namespace cublas {
 +
 +// NOTE: templated routines are custom and not wrappers
 +
 +// Level 3 BLAS
 +// ============
 +template<typename T>
 +void Gemm
 +( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
 +  const T& alpha,
 +  const T* A, BlasInt ALDim, 
 +  const T* B, BlasInt BLDim,
 +  const T& beta,
 +        T* C, BlasInt CLDim );
 +
 +void Gemm
 +( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
 +  const float& alpha,
 +  const float* A, BlasInt ALDim, 
 +  const float* B, BlasInt BLDim,
 +  const float& beta,
 +        float* C, BlasInt CLDim );
 +void Gemm
 +( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
 +  const double& alpha,
 +  const double* A, BlasInt ALDim, 
 +  const double* B, BlasInt BLDim,
 +  const double& beta,
 +        double* C, BlasInt CLDim );
 +void Gemm
 +( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
 +  const scomplex& alpha,
 +  const scomplex* A, BlasInt ALDim, 
 +  const scomplex* B, BlasInt BLDim,
 +  const scomplex& beta,
 +        scomplex* C, BlasInt CLDim );
 +void Gemm
 +( char transA, char transB, BlasInt m, BlasInt n, BlasInt k,
 +  const dcomplex& alpha,
 +  const dcomplex* A, BlasInt ALDim, 
 +  const dcomplex* B, BlasInt BLDim,
 +  const dcomplex& beta,
 +        dcomplex* C, BlasInt CLDim );
 +
 +} // namespace cublas
 +} // namespace El
 +#endif
 +
 #endif // ifndef EL_IMPORTS_BLAS_DECL_HPP
 diff -Naur a/src/blas_like/level3/Gemm.cpp b/src/blas_like/level3/Gemm.cpp
 --- a/src/blas_like/level3/Gemm.cpp	2017-06-08 07:30:44.307096427 -0700
 +++ b/src/blas_like/level3/Gemm.cpp	2017-06-08 07:34:23.062863489 -0700
@@ -16,6 +16,20 @@
 namespace El {
 +char gemm_cpu_gpu_switch = 'c';
 +int min_M = 0, min_N = 0, min_K = 0;
 +
 +void GemmUseGPU(int _min_M, int _min_N, int _min_K) {
 +   gemm_cpu_gpu_switch = 'g';
 +   min_M = _min_M;
 +   min_N = _min_N;
 +   min_K = _min_K;
 +}
 +
 +void GemmUseCPU() {
 +   gemm_cpu_gpu_switch = 'c';
 +}
 +
 template<typename T>
 void Gemm
 ( Orientation orientA, Orientation orientB,
@@ -59,11 +73,30 @@
     const Int k = ( orientA == NORMAL ? A.Width() : A.Height() );
     if( k != 0 )
     {
 +#if defined(EL_USE_CUBLAS)
 +        if (gemm_cpu_gpu_switch == 'g' && 
 +            m >= min_M &&
 +            n >= min_N &&
 +            k >= min_K) {
 +          cublas::Gemm
 +          ( transA, transB, m, n, k,
 +            alpha, A.LockedBuffer(), A.LDim(),
 +                   B.LockedBuffer(), B.LDim(),
 +            beta,  C.Buffer(),       C.LDim() );
 +        } else {
 +          blas::Gemm
 +          ( transA, transB, m, n, k,
 +            alpha, A.LockedBuffer(), A.LDim(),
 +                   B.LockedBuffer(), B.LDim(),
 +            beta,  C.Buffer(),       C.LDim() );
 +        }
 +#else
         blas::Gemm
         ( transA, transB, m, n, k,
           alpha, A.LockedBuffer(), A.LDim(),
                  B.LockedBuffer(), B.LDim(),
           beta,  C.Buffer(),       C.LDim() );
 +#endif
     }
     else
     {
 diff -Naur a/src/core/imports/blas/Gemm.hpp b/src/core/imports/blas/Gemm.hpp
 --- a/src/core/imports/blas/Gemm.hpp	2017-06-08 07:30:45.090529967 -0700
 +++ b/src/core/imports/blas/Gemm.hpp	2017-06-08 07:34:46.503009958 -0700
@@ -41,6 +41,12 @@
 } // extern "C"
 +
 +#if defined(EL_USE_CUBLAS)
 +#include <cublas.h>
 +#include <cub/util_allocator.cuh>
 +#endif
 +
 namespace El {
 namespace blas {
@@ -515,3 +521,515 @@
 } // namespace blas
 } // namespace El
 +
 +
 +#if EL_USE_CUBLAS
 +
 +#define USE_CUB 1
 +
 +namespace El {
 +namespace cublas {
 +
 +#if USE_CUB
 +cub::CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
 +#endif
 +
 +template<typename T>
 +void Gemm
 +( char transA, char transB,
 +  BlasInt m, BlasInt n, BlasInt k,
 +  const T& alpha,
 +  const T* A, BlasInt ALDim,
 +  const T* B, BlasInt BLDim,
 +  const T& beta,
 +        T* C, BlasInt CLDim )
 +{
 +   // put something here
 +    printf("integer version \n");
 +}
 +template void Gemm
 +( char transA, char transB,
 +  BlasInt m, BlasInt n, BlasInt k, 
 +  const Int& alpha,
 +  const Int* A, BlasInt ALDim,
 +  const Int* B, BlasInt BLDim,
 +  const Int& beta,
 +        Int* C, BlasInt CLDim );
 +#ifdef EL_HAVE_QD
 +template void Gemm
 +( char transA, char transB,
 +  BlasInt m, BlasInt n, BlasInt k, 
 +  const DoubleDouble& alpha,
 +  const DoubleDouble* A, BlasInt ALDim,
 +  const DoubleDouble* B, BlasInt BLDim,
 +  const DoubleDouble& beta,
 +        DoubleDouble* C, BlasInt CLDim );
 +template void Gemm
 +( char transA, char transB,
 +  BlasInt m, BlasInt n, BlasInt k, 
 +  const QuadDouble& alpha,
 +  const QuadDouble* A, BlasInt ALDim,
 +  const QuadDouble* B, BlasInt BLDim,
 +  const QuadDouble& beta,
 +        QuadDouble* C, BlasInt CLDim );
 +template void Gemm
 +( char transA, char transB,
 +  BlasInt m, BlasInt n, BlasInt k, 
 +  const Complex<DoubleDouble>& alpha,
 +  const Complex<DoubleDouble>* A, BlasInt ALDim,
 +  const Complex<DoubleDouble>* B, BlasInt BLDim,
 +  const Complex<DoubleDouble>& beta,
 +        Complex<DoubleDouble>* C, BlasInt CLDim );
 +template void Gemm
 +( char transA, char transB,
 +  BlasInt m, BlasInt n, BlasInt k, 
 +  const Complex<QuadDouble>& alpha,
 +  const Complex<QuadDouble>* A, BlasInt ALDim,
 +  const Complex<QuadDouble>* B, BlasInt BLDim,
 +  const Complex<QuadDouble>& beta,
 +        Complex<QuadDouble>* C, BlasInt CLDim );
 +#endif
 +#ifdef EL_HAVE_QUAD
 +template void Gemm
 +( char transA, char transB,
 +  BlasInt m, BlasInt n, BlasInt k, 
 +  const Quad& alpha,
 +  const Quad* A, BlasInt ALDim,
 +  const Quad* B, BlasInt BLDim,
 +  const Quad& beta,
 +        Quad* C, BlasInt CLDim );
 +template void Gemm
 +( char transA, char transB,
 +  BlasInt m, BlasInt n, BlasInt k, 
 +  const Complex<Quad>& alpha,
 +  const Complex<Quad>* A, BlasInt ALDim, 
 +  const Complex<Quad>* B, BlasInt BLDim,
 +  const Complex<Quad>& beta,
 +        Complex<Quad>* C, BlasInt CLDim );
 +#endif
 +#ifdef EL_HAVE_MPC
 +template void Gemm
 +( char transA, char transB,
 +  BlasInt m, BlasInt n, BlasInt k, 
 +  const BigInt& alpha,
 +  const BigInt* A, BlasInt ALDim,
 +  const BigInt* B, BlasInt BLDim,
 +  const BigInt& beta,
 +        BigInt* C, BlasInt CLDim );
 +template void Gemm
 +( char transA, char transB,
 +  BlasInt m, BlasInt n, BlasInt k, 
 +  const BigFloat& alpha,
 +  const BigFloat* A, BlasInt ALDim,
 +  const BigFloat* B, BlasInt BLDim,
 +  const BigFloat& beta,
 +        BigFloat* C, BlasInt CLDim );
 +template void Gemm
 +( char transA, char transB,
 +  BlasInt m, BlasInt n, BlasInt k, 
 +  const Complex<BigFloat>& alpha,
 +  const Complex<BigFloat>* A, BlasInt ALDim,
 +  const Complex<BigFloat>* B, BlasInt BLDim,
 +  const Complex<BigFloat>& beta,
 +        Complex<BigFloat>* C, BlasInt CLDim );
 +#endif
 +
 +void Gemm
 +( char transA, char transB,
 +  BlasInt m, BlasInt n, BlasInt k, 
 +  const float& alpha,
 +  const float* A, BlasInt ALDim,
 +  const float* B, BlasInt BLDim,
 +  const float& beta,
 +        float* C, BlasInt CLDim )
 +{
 +    EL_DEBUG_CSE
 +    EL_DEBUG_ONLY(
 +      if( std::toupper(transA) == 'N' )
 +      {
 +          if( ALDim < Max(m,1) )
 +              LogicError("ALDim was too small: ALDim=",ALDim,",m=",m);
 +      }
 +      else
 +      {
 +          if( ALDim < Max(k,1) )
 +              LogicError("ALDim was too small: ALDim=",ALDim,",k=",k);
 +      }
 +
 +      if( std::toupper(transB) == 'N' )
 +      {
 +          if( BLDim < Max(k,1) )
 +              LogicError("BLDim was too small: BLDim=",BLDim,",k=",k);
 +      }
 +      else
 +      {
 +          if( BLDim < Max(n,1) )
 +              LogicError("BLDim was too small: BLDim=",BLDim,",n=",n);
 +      }
 +
 +      if( CLDim < Max(m,1) )
 +          LogicError("CLDim was too small: CLDim=",CLDim,",m=",m);
 +    )
 +    const char fixedTransA = ( std::toupper(transA) == 'C' ? 'T' : transA );
 +    const char fixedTransB = ( std::toupper(transB) == 'C' ? 'T' : transB );
 + 
 +    const mpi::Comm comm;
 +    const Int commRank = mpi::Rank( comm );
 +    if (commRank == 0) {
 +       //printf("calling cublas Sgemm: m %d n %d k %d\n", m, n, k);
 +    }
 +
 +    BlasInt rowA, colA, rowB, colB, rowC, colC;
 +    // device memory size for A, B and C
 +    BlasInt sizeA, sizeB, sizeC;
 +    float *devA=NULL, *devB=NULL, *devC=NULL;
 +    
 +    rowA = fixedTransA == 'T' ? k : m;
 +    colA = fixedTransA == 'T' ? m : k;
 +    rowB = fixedTransB == 'T' ? n : k;
 +    colB = fixedTransB == 'T' ? k : n;
 +    rowC = m;
 +    colC = n;
 +    sizeA = rowA * colA;
 +    sizeB = rowB * colB;
 +    sizeC = rowC * colC;
 +
 +    cublasStatus stat;
 +    
 +#if USE_CUB
 +    CubDebugExit(g_allocator.DeviceAllocate((void**)&devA, 
 +                 sizeof(float) * (sizeA+sizeB+sizeC) ));
 +#else
 +    stat = cublasAlloc(sizeA+sizeB+sizeC, sizeof(float), (void **) &devA);
 +    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("Alloc A,B,C error\n"); }
 +#endif
 +
 +    devB = devA + sizeA;
 +    devC = devB + sizeB;
 +
 +    // copy matrix A, B and C to device
 +    stat = cublasSetMatrix(rowA, colA, sizeof(float), A, ALDim, devA, rowA);
 +    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix A error\n"); }
 +
 +    stat = cublasSetMatrix(rowB, colB, sizeof(float), B, BLDim, devB, rowB);
 +    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix B error\n"); }
 +    
 +    if (beta != 0.0)
 +    {
 +       stat = cublasSetMatrix(rowC, colC, sizeof(float), C, CLDim, devC, rowC);
 +       if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix C error\n"); }
 +    }
 +    
 +    // cublas<t>gemm
 +    cublasSgemm
 +    ( fixedTransA, fixedTransB, m, n, k,
 +      alpha, devA, rowA, devB, rowB, beta, devC, rowC );
 +
 +    // copy matrix C to host
 +    stat = cublasGetMatrix(rowC, colC, sizeof(float), devC, rowC, C, CLDim);
 +    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("GetMatrix C error\n"); }
 +
 +    // free
 +#if USE_CUB
 +    CubDebugExit(g_allocator.DeviceFree(devA));
 +#else
 +    cublasFree(devA);
 +#endif
 +    //printf("CUBLAS float done ...\n");
 +}
 +
 +void Gemm
 +( char transA, char transB,
 +  BlasInt m, BlasInt n, BlasInt k, 
 +  const double& alpha,
 +  const double* A, BlasInt ALDim, 
 +  const double* B, BlasInt BLDim,
 +  const double& beta,
 +        double* C, BlasInt CLDim )
 +{
 +    EL_DEBUG_CSE
 +    EL_DEBUG_ONLY(
 +      if( std::toupper(transA) == 'N' )
 +      {
 +          if( ALDim < Max(m,1) )
 +              LogicError("ALDim was too small: ALDim=",ALDim,",m=",m);
 +      }
 +      else
 +      {
 +          if( ALDim < Max(k,1) )
 +              LogicError("ALDim was too small: ALDim=",ALDim,",k=",k);
 +      }      
 +
 +      if( std::toupper(transB) == 'N' )
 +      {
 +          if( BLDim < Max(k,1) )
 +              LogicError("BLDim was too small: BLDim=",BLDim,",k=",k);
 +      }
 +      else
 +      {
 +          if( BLDim < Max(n,1) )
 +              LogicError("BLDim was too small: BLDim=",BLDim,",n=",n);
 +      }
 +
 +      if( CLDim < Max(m,1) )
 +          LogicError("CLDim was too small: CLDim=",CLDim,",m=",m);
 +    )
 +    const char fixedTransA = ( std::toupper(transA) == 'C' ? 'T' : transA );
 +    const char fixedTransB = ( std::toupper(transB) == 'C' ? 'T' : transB );
 +
 +    const mpi::Comm comm;
 +    const Int commRank = mpi::Rank( comm );
 +    if (commRank == 0) {
 +       //printf("calling cublas Dgemm: m %d n %d k %d\n", m, n, k);
 +    }
 +
 +    BlasInt rowA, colA, rowB, colB, rowC, colC;
 +    // device memory size for A, B and C
 +    BlasInt sizeA, sizeB, sizeC;
 +    double *devA=NULL, *devB=NULL, *devC=NULL;
 +    
 +    rowA = fixedTransA == 'T' ? k : m;
 +    colA = fixedTransA == 'T' ? m : k;
 +    rowB = fixedTransB == 'T' ? n : k;
 +    colB = fixedTransB == 'T' ? k : n;
 +    rowC = m;
 +    colC = n;
 +    sizeA = rowA * colA;
 +    sizeB = rowB * colB;
 +    sizeC = rowC * colC;
 +
 +    cublasStatus stat;
 +
 +#if USE_CUB
 +    CubDebugExit(g_allocator.DeviceAllocate((void**)&devA, 
 +                 sizeof(double) * (sizeA+sizeB+sizeC) ));
 +#else
 +    stat = cublasAlloc(sizeA+sizeB+sizeC, sizeof(double), (void **) &devA);
 +    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("Alloc A,B,C error\n"); }
 +#endif
 +
 +    devB = devA + sizeA;
 +    devC = devB + sizeB;
 +
 +    // copy matrix A, B and C to device
 +    stat = cublasSetMatrix(rowA, colA, sizeof(double), A, ALDim, devA, rowA);
 +    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix A error\n"); }
 +
 +    stat = cublasSetMatrix(rowB, colB, sizeof(double), B, BLDim, devB, rowB);
 +    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix B error\n"); }
 +    
 +    if (beta != 0.0)
 +    {
 +       stat = cublasSetMatrix(rowC, colC, sizeof(double), C, CLDim, devC, rowC);
 +       if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix C error\n"); }
 +    }
 +
 +    // cublas<t>gemm
 +    cublasDgemm
 +    ( fixedTransA, fixedTransB, m, n, k,
 +      alpha, devA, rowA, devB, rowB, beta, devC, rowC );
 +    
 +    // copy matrix C to host
 +    stat = cublasGetMatrix(rowC, colC, sizeof(double), devC, rowC, C, CLDim);
 +    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("GetMatrix C error\n"); }
 +
 +    // free
 +#if USE_CUB
 +    CubDebugExit(g_allocator.DeviceFree(devA));
 +#else
 +    cublasFree(devA);
 +#endif
 +    //printf("CUBLAS double done ...\n");
 +}
 +
 +void Gemm
 +( char transA, char transB, BlasInt m, BlasInt n, BlasInt k, 
 +  const scomplex& alpha,
 +  const scomplex* A, BlasInt ALDim, 
 +  const scomplex* B, BlasInt BLDim,
 +  const scomplex& beta,
 +        scomplex* C, BlasInt CLDim )
 +{
 +    EL_DEBUG_CSE
 +    EL_DEBUG_ONLY(
 +      if( std::toupper(transA) == 'N' )
 +      {
 +          if( ALDim < Max(m,1) )
 +              LogicError("ALDim was too small: ALDim=",ALDim,",m=",m);
 +      }
 +      else
 +      {
 +          if( ALDim < Max(k,1) )
 +              LogicError("ALDim was too small: ALDim=",ALDim,",k=",k);
 +      }      
 +
 +      if( std::toupper(transB) == 'N' )
 +      {
 +          if( BLDim < Max(k,1) )
 +              LogicError("BLDim was too small: BLDim=",BLDim,",k=",k);
 +      }
 +      else
 +      {
 +          if( BLDim < Max(n,1) )
 +              LogicError("BLDim was too small: BLDim=",BLDim,",n=",n);
 +      }
 +
 +      if( CLDim < Max(m,1) )
 +          LogicError("CLDim was too small: CLDim=",CLDim,",m=",m);
 +    )
 +        
 +    const char fixedTransA = transA;
 +    const char fixedTransB = transB;
 +    
 +    const mpi::Comm comm;
 +    const Int commRank = mpi::Rank( comm );
 +    if (commRank == 0) {
 +       //printf("calling cublas Cgemm: m %d n %d k %d\n", m, n, k);
 +    }
 +
 +    BlasInt rowA, colA, rowB, colB, rowC, colC;
 +    // device memory size for A, B and C
 +    BlasInt sizeA, sizeB, sizeC;
 +    cuComplex *devA=NULL, *devB=NULL, *devC=NULL;
 +    
 +    rowA = fixedTransA == 'T' ? k : m;
 +    colA = fixedTransA == 'T' ? m : k;
 +    rowB = fixedTransB == 'T' ? n : k;
 +    colB = fixedTransB == 'T' ? k : n;
 +    rowC = m;
 +    colC = n;
 +    sizeA = rowA * colA;
 +    sizeB = rowB * colB;
 +    sizeC = rowC * colC;
 +
 +    cublasStatus stat;
 +    stat = cublasAlloc(sizeA+sizeB+sizeC, sizeof(cuComplex), (void **) &devA);
 +    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("Alloc A,B,C error\n"); }
 +
 +    devB = devA + sizeA;
 +    devC = devB + sizeB;
 +
 +    // copy matrix A, B and C to device
 +    stat = cublasSetMatrix(rowA, colA, sizeof(cuComplex), A, ALDim, devA, rowA);
 +    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix A error\n"); }
 +
 +    stat = cublasSetMatrix(rowB, colB, sizeof(cuComplex), B, BLDim, devB, rowB);
 +    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix B error\n"); }
 +    
 +    if (beta.real() != 0.0 || beta.imag() != 0.0)
 +    {
 +       stat = cublasSetMatrix(rowC, colC, sizeof(cuComplex), C, CLDim, devC, rowC);
 +       if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix C error\n"); }
 +    }
 +
 +    // cublas<t>gemm
 +    cublasCgemm
 +    ( fixedTransA, fixedTransB, m, n, k,
 +      *((cuComplex*) &alpha), devA, rowA, devB, rowB, *((cuComplex*) &beta), devC, rowC );
 +
 +    // copy matrix C to host
 +    stat = cublasGetMatrix(rowC, colC, sizeof(cuComplex), devC, rowC, C, CLDim);
 +    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("GetMatrix C error\n"); }
 +
 +    // free
 +    cublasFree(devA);
 +}
 +
 +void Gemm
 +( char transA, char transB, BlasInt m, BlasInt n, BlasInt k, 
 +  const dcomplex& alpha,
 +  const dcomplex* A, BlasInt ALDim, 
 +  const dcomplex* B, BlasInt BLDim,
 +  const dcomplex& beta,
 +        dcomplex* C, BlasInt CLDim )
 +{
 +    EL_DEBUG_CSE
 +    EL_DEBUG_ONLY(
 +      if( std::toupper(transA) == 'N' )
 +      {
 +          if( ALDim < Max(m,1) )
 +              LogicError("ALDim was too small: ALDim=",ALDim,",m=",m);
 +      }
 +      else
 +      {
 +          if( ALDim < Max(k,1) )
 +              LogicError("ALDim was too small: ALDim=",ALDim,",k=",k);
 +      }      
 +
 +      if( std::toupper(transB) == 'N' )
 +      {
 +          if( BLDim < Max(k,1) )
 +              LogicError("BLDim was too small: BLDim=",BLDim,",k=",k);
 +      }
 +      else
 +      {
 +          if( BLDim < Max(n,1) )
 +              LogicError("BLDim was too small: BLDim=",BLDim,",n=",n);
 +      }
 +
 +      if( CLDim < Max(m,1) )
 +          LogicError("CLDim was too small: CLDim=",CLDim,",m=",m);
 +    )
 +
 +    const char fixedTransA = transA;
 +    const char fixedTransB = transB;
 +       
 +    const mpi::Comm comm;
 +    const Int commRank = mpi::Rank( comm );
 +    if (commRank == 0) {
 +       //printf("calling cublas Zgemm: m %d n %d k %d\n", m, n, k);
 +    }
 +
 +    BlasInt rowA, colA, rowB, colB, rowC, colC;
 +    // device memory size for A, B and C
 +    BlasInt sizeA, sizeB, sizeC;
 +    cuDoubleComplex *devA=NULL, *devB=NULL, *devC=NULL;
 +    
 +    rowA = fixedTransA == 'T' ? k : m;
 +    colA = fixedTransA == 'T' ? m : k;
 +    rowB = fixedTransB == 'T' ? n : k;
 +    colB = fixedTransB == 'T' ? k : n;
 +    rowC = m;
 +    colC = n;
 +    sizeA = rowA * colA;
 +    sizeB = rowB * colB;
 +    sizeC = rowC * colC;
 +
 +    cublasStatus stat;
 +    stat = cublasAlloc(sizeA+sizeB+sizeC, sizeof(cuDoubleComplex), (void **) &devA);
 +    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("Alloc A,B,C error\n"); }
 +
 +    devB = devA + sizeA;
 +    devC = devB + sizeB;
 +
 +    // copy matrix A, B and C to device
 +    stat = cublasSetMatrix(rowA, colA, sizeof(cuDoubleComplex), A, ALDim, devA, rowA);
 +    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix A error\n"); }
 +
 +    stat = cublasSetMatrix(rowB, colB, sizeof(cuDoubleComplex), B, BLDim, devB, rowB);
 +    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix B error\n"); }
 +    
 +    if (beta.real() != 0.0 || beta.imag() != 0.0)
 +    {
 +       stat = cublasSetMatrix(rowC, colC, sizeof(cuDoubleComplex), C, CLDim, devC, rowC);
 +       if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("SetMatrix C error\n"); }
 +    }
 +
 +    cublasZgemm
 +    ( fixedTransA, fixedTransB, m, n, k,
 +      *((cuDoubleComplex*) &alpha), devA, rowA, devB, rowB, *((cuDoubleComplex*) &beta), 
 +      devC, rowC );
 +
 +    // copy matrix C to host
 +    stat = cublasGetMatrix(rowC, colC, sizeof(cuDoubleComplex), devC, rowC, C, CLDim);
 +    if (stat != CUBLAS_STATUS_SUCCESS) { RuntimeError("GetMatrix C error\n"); }
 +
 +    // free
 +    cublasFree(devA);
 +}
 +
 +} // namespace cublas
 +} // namespace El
 +
 +#endif
 +
--- a/var/spack/repos/builtin/packages/elemental/package.py
+++ b/var/spack/repos/builtin/packages/elemental/package.py
@ -33,6 +33,7 @@ class Elemental(CMakePackage):
    homepage = "http://libelemental.org"
    url      = "https://github.com/elemental/Elemental/archive/v0.87.6.tar.gz"
    version('master', git='https://github.com/elemental/Elemental.git', branch='master')
    version('0.87.7', '6c1e7442021c59a36049e37ea69b8075')
    version('0.87.6', '9fd29783d45b0a0e27c0df85f548abe9')
@ -52,6 +53,8 @@ class Elemental(CMakePackage):
            description='Enable quad precision')
    variant('int64', default=False,
            description='Use 64bit integers')
    variant('cublas', default=False,
            description='Enable cuBLAS for local BLAS operations')
    # When this variant is set remove the normal dependencies since
    # Elemental has to build BLAS and ScaLAPACK internally
    variant('int64_blas', default=False,
@ -62,15 +65,21 @@ class Elemental(CMakePackage):
    variant('build_type', default='Release',
            description='The build type to build',
            values=('Debug', 'Release'))
    variant('blas', default='openblas', values=('openblas', 'mkl'),
            description='Enable the use of OpenBlas/MKL')
-    # Note that this forces us to use OpenBLAS until #1712 is fixed
+    # Note that #1712 forces us to enumerate the different blas variants
    depends_on('blas', when='~openmp_blas ~int64_blas')
    # Hack to forward variant to openblas package
    # Allow Elemental to build internally when using 8-byte ints
-    depends_on('openblas +openmp', when='+openmp_blas ~int64_blas')
+    depends_on('openblas +openmp', when='blas=openblas +openmp_blas ~int64_blas')
    depends_on('intel-mkl', when="blas=mkl ~openmp_blas ~int64_blas")
    depends_on('intel-mkl +openmp', when='blas=mkl +openmp_blas ~int64_blas')
    depends_on('intel-mkl@2017.1 +openmp +ilp64', when='blas=mkl +openmp_blas +int64_blas')
    # Note that this forces us to use OpenBLAS until #1712 is fixed
-    depends_on('lapack', when='~openmp_blas')
+    depends_on('lapack', when='blas=openblas ~openmp_blas')
    depends_on('metis')
    depends_on('metis +int64', when='+int64')
    depends_on('mpi')
@ -79,6 +88,8 @@ class Elemental(CMakePackage):
    extends('python', when='+python')
    depends_on('python@:2.8', when='+python')
    patch('elemental_cublas.patch', when='+cublas')
    @property
    def libs(self):
        shared = True if '+shared' in self.spec else False
@ -126,8 +137,7 @@ def cmake_args(self):
                math_libs = spec['scalapack'].libs + math_libs
            args.extend([
-                '-DMATH_LIBS:STRING={0}'.format(math_libs.search_flags),
+                '-DMATH_LIBS:STRING={0}'.format(math_libs.ld_flags)])
                '-DMATH_LIBS:STRING={0}'.format(math_libs.link_flags)])
        if '+python' in spec:
            args.extend([
--- a/var/spack/repos/builtin/packages/lbann/package.py
+++ b/var/spack/repos/builtin/packages/lbann/package.py
@ -39,37 +39,49 @@ class Lbann(CMakePackage):
    variant('gpu', default=False, description='Builds with support for GPUs via CUDA and cuDNN')
    variant('opencv', default=True, description='Builds with support for image processing routines with OpenCV')
    variant('seq_init', default=False, description='Force serial initialization of weight matrices.')
    variant('dtype', default=4, description='Size (bits) of floating point representation for weights')
    variant('build_type', default='Release',
            description='The build type to build',
            values=('Debug', 'Release'))
    depends_on('elemental +openmp_blas +scalapack +shared +int64')
    depends_on('elemental +openmp_blas +scalapack +shared +int64 build_type=Debug', 
               when=('build_type=Debug'))
    depends_on('cuda', when='+gpu')
    depends_on('mpi')
-    depends_on('opencv@3.2.0', when='+opencv')
+    depends_on('opencv@3.2.0: +openmp +core +highgui +imgproc +jpeg +png +tiff +zlib', when='+opencv')
    depends_on('protobuf@3.0.2:')
    depends_on('cnpy')
    def cmake_args(self):
        spec = self.spec
        # Environment variables
        CPPFLAGS = []
        CPPFLAGS.append('-DLBANN_SET_EL_RNG')
-        if '~seq_init' in spec:
+
-            CPPFLAGS.append('-DLBANN_PARALLEL_RANDOM_MATRICES')
+        CPPFLAGS.append('-DLBANN_DATATYPE={0}'.format(
            int(spec.variants['dtype'].value)))
        args = [
            '-DCMAKE_INSTALL_MESSAGE=LAZY',
            '-DCMAKE_CXX_FLAGS=%s' % ' '.join(CPPFLAGS),
            '-DWITH_CUDA:BOOL=%s' % ('+gpu' in spec),
            '-DWITH_CUDNN:BOOL=%s' % ('+gpu' in spec),
            '-DELEMENTAL_USE_CUBLAS:BOOL=%s' % (
                '+cublas' in spec['elemental']),
            '-DWITH_TBINF=OFF',
            '-DWITH_VTUNE=OFF',
-            '-DElemental_DIR={0}'.format(self.spec['elemental'].prefix),
+            '-DElemental_DIR={0}'.format(spec['elemental'].prefix),
            '-DCNPY_DIR={0}'.format(spec['cnpy'].prefix),
            '-DELEMENTAL_MATH_LIBS={0}'.format(
-                self.spec['elemental'].libs),
+                spec['elemental'].libs),
            '-DSEQ_INIT:BOOL=%s' % ('+seq_init' in spec),
            '-DVERBOSE=0',
            '-DLBANN_HOME=.',
            '-DLBANN_VER=spack']
-        if '+opencv' in self.spec:
+        if '+opencv' in spec:
            args.extend(['-DOpenCV_DIR:STRING={0}'.format(
-                self.spec['opencv'].prefix)])
+                spec['opencv'].prefix)])
        return args
--- a/var/spack/repos/builtin/packages/libtiff/package.py
+++ b/var/spack/repos/builtin/packages/libtiff/package.py
@ -35,6 +35,9 @@ class Libtiff(AutotoolsPackage):
    version('4.0.6', 'd1d2e940dea0b5ad435f21f03d96dd72')
    version('4.0.3', '051c1068e6a0627f461948c365290410')
-    depends_on('jpeg')
+    variant('turbo', default=False, description='use libjpeg-turbo')
    depends_on('jpeg', when='-turbo')
    depends_on('libjpeg-turbo', when='+turbo')
    depends_on('zlib')
    depends_on('xz')
--- a/var/spack/repos/builtin/packages/opencv/package.py
+++ b/var/spack/repos/builtin/packages/opencv/package.py
@ -42,8 +42,15 @@ class Opencv(CMakePackage):
    homepage = 'http://opencv.org/'
    url = 'https://github.com/Itseez/opencv/archive/3.1.0.tar.gz'
    version('master', git="https://github.com/opencv/opencv.git", branch="master")
    version('3.2.0',    'a43b65488124ba33dde195fea9041b70')
    version('3.1.0',    '70e1dd07f0aa06606f1bc0e3fa15abd3')
    version('2.4.13.2', 'fe52791ce523681a67036def4c25261b')
    version('2.4.13.1', 'f6d354500d5013e60dc0fc44b07a63d1')
    version('2.4.13',   '8feb45a71adad89b8017a777477c3eff')
    version('2.4.12.3', '2496a4a4caf8fecfbfc294fbe6a814b0')
    version('2.4.12.2', 'bc0c60c2ea1cf4078deef99569912fc7')
    version('2.4.12.1', '7192f51434710904b5e3594872b897c3')
    variant('shared', default=True,
            description='Enables the build of shared libraries')
@ -59,13 +66,21 @@ class Opencv(CMakePackage):
            description='Enables the build of Python extensions')
    variant('java', default=False,
            description='Activates support for Java')
    variant('openmp', default=False, description='Activates support for OpenMP threads')
    variant('core', default=False, description='Include opencv_core module into the OpenCV build')
    variant('highgui', default=False, description='Include opencv_highgui module into the OpenCV build')
    variant('imgproc', default=False, description='Include opencv_imgproc module into the OpenCV build')
    variant('jpeg', default=False, description='Include JPEG support')
    variant('png', default=False, description='Include PNG support')
    variant('tiff', default=False, description='Include TIFF support')
    variant('zlib', default=False, description='Build zlib from source')
    depends_on('eigen', when='+eigen', type='build')
-    depends_on('zlib')
+    depends_on('zlib', when='+zlib')
-    depends_on('libpng')
+    depends_on('libpng', when='+png')
-    depends_on('libjpeg-turbo')
+    depends_on('libjpeg-turbo', when='+jpeg')
-    depends_on('libtiff')
+    depends_on('libtiff+turbo', when='+tiff')
    depends_on('jasper', when='+jasper')
    depends_on('cuda', when='+cuda')
@ -94,6 +109,22 @@ def cmake_args(self):
                'ON' if '+vtk' in spec else 'OFF')),
            '-DBUILD_opencv_java:BOOL={0}'.format((
                'ON' if '+java' in spec else 'OFF')),
            '-DBUILD_opencv_core:BOOL={0}'.format((
                'ON' if '+core' in spec else 'OFF')),
            '-DBUILD_opencv_highgui:BOOL={0}'.format((
                'ON' if '+highgui' in spec else 'OFF')),
            '-DBUILD_opencv_imgproc:BOOL={0}'.format((
                'ON' if '+imgproc' in spec else 'OFF')),
            '-DWITH_JPEG:BOOL={0}'.format((
                'ON' if '+jpeg' in spec else 'OFF')),
            '-DWITH_PNG:BOOL={0}'.format((
                'ON' if '+png' in spec else 'OFF')),
            '-DWITH_TIFF:BOOL={0}'.format((
                'ON' if '+tiff' in spec else 'OFF')),
            '-DWITH_ZLIB:BOOL={0}'.format((
                'ON' if '+zlib' in spec else 'OFF')),
            '-DWITH_OPENMP:BOOL={0}'.format((
                'ON' if '+openmp' in spec else 'OFF')),
        ]
        # Media I/O
@ -115,16 +146,19 @@ def cmake_args(self):
            '-DPNG_INCLUDE_DIR:PATH={0}'.format(libpng.prefix.include)
        ])
        if '+jpeg' in spec:
            libjpeg = spec['libjpeg-turbo']
-        args.extend([
+            cmake_options.extend([
                '-DBUILD_JPEG:BOOL=OFF',
                '-DJPEG_LIBRARY:FILEPATH={0}'.format(
                    join_path(libjpeg.prefix.lib,
                              'libjpeg.{0}'.format(dso_suffix))),
                '-DJPEG_INCLUDE_DIR:PATH={0}'.format(libjpeg.prefix.include)
            ])
        if '+tiff' in spec:
            libtiff = spec['libtiff']
-        args.extend([
+            cmake_options.extend([
                '-DTIFF_LIBRARY_{0}:FILEPATH={1}'.format((
                    'DEBUG' if '+debug' in spec else 'RELEASE'),
                    join_path(libtiff.prefix.lib,
@ -132,8 +166,9 @@ def cmake_args(self):
                '-DTIFF_INCLUDE_DIR:PATH={0}'.format(libtiff.prefix.include)
            ])
        if '+jasper' in spec:
            jasper = spec['jasper']
-        args.extend([
+            cmake_options.extend([
                '-DJASPER_LIBRARY_{0}:FILEPATH={1}'.format((
                    'DEBUG' if '+debug' in spec else 'RELEASE'),
                    join_path(jasper.prefix.lib,