forked from TASC/HLRS-OpenMP-GPU-2024
Exercises and slides
This commit is contained in:
commit
8307cf55bf
70 changed files with 5333 additions and 0 deletions
14
00-worksharing/Makefile
Normal file
14
00-worksharing/Makefile
Normal file
|
@ -0,0 +1,14 @@
|
|||
ALL_RECURSIVE=all-recursive clean-recursive
|
||||
|
||||
DIRS=$(shell ls -d *-* | sed '/_/d' 2>/dev/null)
|
||||
|
||||
all: $(CONFIG) all-recursive
|
||||
clean: $(CONFIG) clean-recursive
|
||||
|
||||
$(ALL_RECURSIVE):
|
||||
@failcom='exit 1';\
|
||||
target=`echo $@ | sed s/-recursive//`; \
|
||||
for subdir in $(DIRS); do \
|
||||
(cd $$subdir && $(MAKE) $$target) || eval $$failcom; \
|
||||
done;
|
||||
|
0
00-worksharing/bin/.gitdir
Normal file
0
00-worksharing/bin/.gitdir
Normal file
30
00-worksharing/bin/run-cholesky.sh
Executable file
30
00-worksharing/bin/run-cholesky.sh
Executable file
|
@ -0,0 +1,30 @@
|
|||
#!/bin/bash
|
||||
#SBATCH --job-name=cholesky
|
||||
#SBATCH --workdir=.
|
||||
#SBATCH --output=cholesky_%j.out
|
||||
#SBATCH --error=cholesky_%j.err
|
||||
#SBATCH --cpus-per-task=48
|
||||
#SBATCH --ntasks=1
|
||||
#SBATCH --time=00:30:00
|
||||
#SBATCH --qos=debug
|
||||
|
||||
export IFS=";"
|
||||
|
||||
THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48"
|
||||
#THREADS="01;02;04;06;08;10;12"
|
||||
|
||||
#MSIZES="2048;4096;8192"
|
||||
MSIZES="8192"
|
||||
MSIZES="16384"
|
||||
MSIZES="2048"
|
||||
|
||||
BSIZES="256"
|
||||
|
||||
for MS in $MSIZES; do
|
||||
for BS in $BSIZES; do
|
||||
for threads in $THREADS; do
|
||||
OMP_NUM_THREADS=$threads ./cholesky-for $MS $BS 0
|
||||
OMP_NUM_THREADS=$threads ./cholesky-for-opt $MS $BS 0
|
||||
done
|
||||
done
|
||||
done
|
23
00-worksharing/cholesky-for-opt/Makefile
Normal file
23
00-worksharing/cholesky-for-opt/Makefile
Normal file
|
@ -0,0 +1,23 @@
|
|||
BIN_DIR=../bin
|
||||
PROGRAM=$(BIN_DIR)/cholesky-for-opt
|
||||
|
||||
MKL_HOME=$(MKLROOT)
|
||||
|
||||
CC=gcc
|
||||
CFLAGS=-O3 -std=gnu99 -I$(MKL_HOME)/include -fopenmp
|
||||
LDFLAGS=-L$(MKL_HOME)/lib/intel64 -lmkl_sequential -lmkl_core -lmkl_rt -lpthread -lm -fopenmp
|
||||
#VERBOSE=-DVERBOSE
|
||||
|
||||
all: $(PROGRAM)
|
||||
|
||||
$(PROGRAM): cholesky.c
|
||||
$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
|
||||
|
||||
$(BIN_DIR):
|
||||
mkdir $@
|
||||
|
||||
clean:
|
||||
rm -rf $(PROGRAM) *.o
|
||||
|
||||
wipe: clean
|
||||
rm -rf *.out *.err
|
289
00-worksharing/cholesky-for-opt/cholesky.c
Normal file
289
00-worksharing/cholesky-for-opt/cholesky.c
Normal file
|
@ -0,0 +1,289 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include <sys/time.h>
|
||||
#include <sys/times.h>
|
||||
|
||||
#include <math.h>
|
||||
#include <mkl.h>
|
||||
|
||||
#include "omp.h"
|
||||
|
||||
#if !defined(_OPENMP)
|
||||
int omp_get_max_threads() { return 1; }
|
||||
int omp_get_num_threads() { return 1; }
|
||||
#endif
|
||||
|
||||
void cholesky(int ts, int nt, double* Ah[nt][nt])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
|
||||
#endif
|
||||
|
||||
for (int k = 0; k < nt; k++) {
|
||||
// Diagonal Block factorization: using LAPACK
|
||||
LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
|
||||
|
||||
// Triangular systems
|
||||
for (int i = k + 1; i < nt; i++) {
|
||||
cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
|
||||
}
|
||||
|
||||
// Update trailing matrix
|
||||
for (int i = k + 1; i < nt; i++) {
|
||||
for (int j = k + 1; j < i; j++) {
|
||||
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
|
||||
}
|
||||
cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef VERBOSE
|
||||
printf("> ...end of Cholesky Factorization.\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
float get_time()
|
||||
{
|
||||
static double gtod_ref_time_sec = 0.0;
|
||||
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
|
||||
// If this is the first invocation of through dclock(), then initialize the
|
||||
// "reference time" global variable to the seconds field of the tv struct.
|
||||
if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
|
||||
|
||||
// Normalize the seconds field of the tv struct so that it is relative to the
|
||||
// "reference time" that was recorded during the first invocation of dclock().
|
||||
const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
|
||||
|
||||
// Compute the number of seconds since the reference time.
|
||||
const double t = norm_sec + tv.tv_usec * 1.0e-6;
|
||||
|
||||
return (float) t;
|
||||
}
|
||||
|
||||
// Robust Check the factorization of the matrix A2
|
||||
// Using directly Fortran services: dlacpy_, dtrmm, dlange_
|
||||
static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Checking the Cholesky Factorization... \n");
|
||||
#endif
|
||||
char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
|
||||
|
||||
double *Residual = (double *) malloc(N*N*sizeof(double));
|
||||
double *L1 = (double *) malloc(N*N*sizeof(double));
|
||||
double *L2 = (double *) malloc(N*N*sizeof(double));
|
||||
double *work = (double *) malloc(N*sizeof(double));
|
||||
|
||||
memset((void*)L1, 0, N*N*sizeof(double));
|
||||
memset((void*)L2, 0, N*N*sizeof(double));
|
||||
|
||||
double alpha= 1.0;
|
||||
|
||||
dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
|
||||
|
||||
/* Dealing with L'L or U'U */
|
||||
if (uplo == 'U'){
|
||||
dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
|
||||
dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
|
||||
dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
|
||||
} else{
|
||||
dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
|
||||
dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
|
||||
dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
|
||||
}
|
||||
|
||||
/* Compute the Residual || A -L'L|| */
|
||||
for (int i = 0; i < N; i++)
|
||||
for (int j = 0; j < N; j++)
|
||||
Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
|
||||
|
||||
double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
|
||||
double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
|
||||
|
||||
#ifdef VERBOSE
|
||||
printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
|
||||
#endif
|
||||
|
||||
const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
|
||||
|
||||
#ifdef VERBOSE
|
||||
if ( info_factorization) printf("> - Factorization is suspicious!\n");
|
||||
else printf("> - Factorization is CORRECT!\n");
|
||||
#endif
|
||||
|
||||
free(Residual); free(L1); free(L2); free(work);
|
||||
|
||||
return info_factorization;
|
||||
}
|
||||
|
||||
void initialize_matrix(const int n, const int ts, double *matrix)
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Initializing matrix with random values...\n");
|
||||
#endif
|
||||
|
||||
int ISEED[4] = {0,0,0,1};
|
||||
int intONE=1;
|
||||
|
||||
for (int i = 0; i < n*n; i+=n) {
|
||||
dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
|
||||
}
|
||||
|
||||
for (int i=0; i<n; i++) {
|
||||
for (int j=0; j<n; j++) {
|
||||
matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
|
||||
matrix[i*n+j] = matrix[j*n+i];
|
||||
}
|
||||
}
|
||||
|
||||
// Diagonal values
|
||||
for (int i = 0; i < n; i++) {
|
||||
matrix[i*n+i] += (double) n;
|
||||
}
|
||||
}
|
||||
|
||||
void gather_block(const int N, const int ts, double *Alin, double *A)
|
||||
{
|
||||
for (int i = 0; i < ts; i++)
|
||||
for (int j = 0; j < ts; j++) {
|
||||
A[i*ts + j] = Alin[i*N + j];
|
||||
}
|
||||
}
|
||||
|
||||
void scatter_block(const int N, const int ts, double *A, double *Alin)
|
||||
{
|
||||
for (int i = 0; i < ts; i++)
|
||||
for (int j = 0; j < ts; j++) {
|
||||
Alin[i*N + j] = A[i*ts + j];
|
||||
}
|
||||
}
|
||||
|
||||
void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Converting linear matrix to blocks...\n");
|
||||
#endif
|
||||
for (int i = 0; i < DIM; i++)
|
||||
for (int j = 0; j < DIM; j++) {
|
||||
gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Converting blocked matrix to linear...\n");
|
||||
#endif
|
||||
for (int i = 0; i < DIM; i++)
|
||||
for (int j = 0; j < DIM; j++) {
|
||||
scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
char *result[3] = {"n/a","pass","FAIL"};
|
||||
const double eps = pow(2.0, -53);
|
||||
|
||||
// If number of arguments is not 4, print help
|
||||
if ( argc != 4) {
|
||||
printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
|
||||
exit( -1 );
|
||||
}
|
||||
|
||||
const int n = atoi(argv[1]); // matrix size
|
||||
const int ts = atoi(argv[2]); // tile size
|
||||
int check = atoi(argv[3]); // check result?
|
||||
|
||||
// Compute number of tiles
|
||||
const int nt = n / ts;
|
||||
assert((nt*ts) == n); // tile size should divide size
|
||||
|
||||
// Allocate matrix
|
||||
double * const matrix = (double *) malloc(n * n * sizeof(double));
|
||||
assert(matrix != NULL);
|
||||
|
||||
// Initialize matrix
|
||||
initialize_matrix(n, ts, matrix);
|
||||
|
||||
// Allocate original matrix, and duplicate it, for debugging purposes
|
||||
double * const original_matrix = (double *) malloc(n * n * sizeof(double));
|
||||
assert(original_matrix != NULL);
|
||||
|
||||
// Save a copy of the original matrix
|
||||
for (int i = 0; i < n * n; i++ ) {
|
||||
original_matrix[i] = matrix[i];
|
||||
}
|
||||
|
||||
// Set version description: Indirect blocked matrix
|
||||
const char *version = "I-Blocked matrix";
|
||||
|
||||
// Allocate blocked matrix
|
||||
double *Ah[nt][nt];
|
||||
for (int i = 0; i < nt; i++) {
|
||||
for (int j = 0; j < nt; j++) {
|
||||
Ah[i][j] = malloc(ts * ts * sizeof(double));
|
||||
assert(Ah[i][j] != NULL);
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------
|
||||
// Convert, compute (time), and re-convert
|
||||
// ---------------------------------------
|
||||
convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
|
||||
const float tref = get_time();
|
||||
cholesky(ts, nt, (double* (*)[nt]) Ah);
|
||||
const float time = get_time() - tref;
|
||||
convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
|
||||
|
||||
// Free blocked matrix
|
||||
for (int i = 0; i < nt; i++) {
|
||||
for (int j = 0; j < nt; j++) {
|
||||
assert(Ah[i][j] != NULL);
|
||||
free(Ah[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
// Check result, if requested
|
||||
if ( check ) {
|
||||
const char uplo = 'L';
|
||||
if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
|
||||
}
|
||||
|
||||
// Free original matrix, not needed anymore
|
||||
free(original_matrix);
|
||||
|
||||
// Compute GFLOPs (Not verified)
|
||||
float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
|
||||
|
||||
// Print results
|
||||
#ifdef VERBOSE
|
||||
printf( "\n" );
|
||||
printf( "============ CHOLESKY RESULTS ============\n" );
|
||||
printf( " test %s\n", argv[0]);
|
||||
printf( " version %s\n", version);
|
||||
printf( " matrix size: %dx%d\n", n, n);
|
||||
printf( " tile size: %dx%d\n", ts, ts);
|
||||
printf( " number of threads: %d\n", omp_get_max_threads());
|
||||
printf( " time (s): %f\n", time);
|
||||
printf( " performance (gflops): %f\n", gflops);
|
||||
printf( " check: %s\n", result[check]);
|
||||
printf( "==========================================\n" );
|
||||
#else
|
||||
printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
|
||||
#endif
|
||||
|
||||
// Free matrix
|
||||
free(matrix);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
293
00-worksharing/cholesky-for-opt/solution/cholesky.c
Normal file
293
00-worksharing/cholesky-for-opt/solution/cholesky.c
Normal file
|
@ -0,0 +1,293 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include <sys/time.h>
|
||||
#include <sys/times.h>
|
||||
|
||||
#include <math.h>
|
||||
#include <mkl.h>
|
||||
|
||||
#include "omp.h"
|
||||
|
||||
#if !defined(_OPENMP)
|
||||
int omp_get_max_threads() { return 1; }
|
||||
int omp_get_num_threads() { return 1; }
|
||||
#endif
|
||||
|
||||
void cholesky(int ts, int nt, double* Ah[nt][nt])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
|
||||
#endif
|
||||
|
||||
for (int k = 0; k < nt; k++) {
|
||||
// Diagonal Block factorization: using LAPACK
|
||||
LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
|
||||
|
||||
// Triangular systems
|
||||
#pragma omp parallel for
|
||||
for (int i = k + 1; i < nt; i++) {
|
||||
cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
|
||||
}
|
||||
|
||||
// Update trailing matrix
|
||||
#pragma omp parallel
|
||||
for (int i = k + 1; i < nt; i++) {
|
||||
#pragma omp for schedule(dynamic, 1) nowait
|
||||
for (int j = k + 1; j < i; j++) {
|
||||
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
|
||||
}
|
||||
#pragma omp single nowait
|
||||
cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef VERBOSE
|
||||
printf("> ...end of Cholesky Factorization.\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
float get_time()
|
||||
{
|
||||
static double gtod_ref_time_sec = 0.0;
|
||||
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
|
||||
// If this is the first invocation of through dclock(), then initialize the
|
||||
// "reference time" global variable to the seconds field of the tv struct.
|
||||
if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
|
||||
|
||||
// Normalize the seconds field of the tv struct so that it is relative to the
|
||||
// "reference time" that was recorded during the first invocation of dclock().
|
||||
const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
|
||||
|
||||
// Compute the number of seconds since the reference time.
|
||||
const double t = norm_sec + tv.tv_usec * 1.0e-6;
|
||||
|
||||
return (float) t;
|
||||
}
|
||||
|
||||
// Robust Check the factorization of the matrix A2
|
||||
// Using directly Fortran services: dlacpy_, dtrmm, dlange_
|
||||
static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Checking the Cholesky Factorization... \n");
|
||||
#endif
|
||||
char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
|
||||
|
||||
double *Residual = (double *) malloc(N*N*sizeof(double));
|
||||
double *L1 = (double *) malloc(N*N*sizeof(double));
|
||||
double *L2 = (double *) malloc(N*N*sizeof(double));
|
||||
double *work = (double *) malloc(N*sizeof(double));
|
||||
|
||||
memset((void*)L1, 0, N*N*sizeof(double));
|
||||
memset((void*)L2, 0, N*N*sizeof(double));
|
||||
|
||||
double alpha= 1.0;
|
||||
|
||||
dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
|
||||
|
||||
/* Dealing with L'L or U'U */
|
||||
if (uplo == 'U'){
|
||||
dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
|
||||
dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
|
||||
dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
|
||||
} else{
|
||||
dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
|
||||
dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
|
||||
dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
|
||||
}
|
||||
|
||||
/* Compute the Residual || A -L'L|| */
|
||||
for (int i = 0; i < N; i++)
|
||||
for (int j = 0; j < N; j++)
|
||||
Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
|
||||
|
||||
double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
|
||||
double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
|
||||
|
||||
#ifdef VERBOSE
|
||||
printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
|
||||
#endif
|
||||
|
||||
const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
|
||||
|
||||
#ifdef VERBOSE
|
||||
if ( info_factorization) printf("> - Factorization is suspicious!\n");
|
||||
else printf("> - Factorization is CORRECT!\n");
|
||||
#endif
|
||||
|
||||
free(Residual); free(L1); free(L2); free(work);
|
||||
|
||||
return info_factorization;
|
||||
}
|
||||
|
||||
void initialize_matrix(const int n, const int ts, double *matrix)
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Initializing matrix with random values...\n");
|
||||
#endif
|
||||
|
||||
int ISEED[4] = {0,0,0,1};
|
||||
int intONE=1;
|
||||
|
||||
for (int i = 0; i < n*n; i+=n) {
|
||||
dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
|
||||
}
|
||||
|
||||
for (int i=0; i<n; i++) {
|
||||
for (int j=0; j<n; j++) {
|
||||
matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
|
||||
matrix[i*n+j] = matrix[j*n+i];
|
||||
}
|
||||
}
|
||||
|
||||
// Diagonal values
|
||||
for (int i = 0; i < n; i++) {
|
||||
matrix[i*n+i] += (double) n;
|
||||
}
|
||||
}
|
||||
|
||||
void gather_block(const int N, const int ts, double *Alin, double *A)
|
||||
{
|
||||
for (int i = 0; i < ts; i++)
|
||||
for (int j = 0; j < ts; j++) {
|
||||
A[i*ts + j] = Alin[i*N + j];
|
||||
}
|
||||
}
|
||||
|
||||
void scatter_block(const int N, const int ts, double *A, double *Alin)
|
||||
{
|
||||
for (int i = 0; i < ts; i++)
|
||||
for (int j = 0; j < ts; j++) {
|
||||
Alin[i*N + j] = A[i*ts + j];
|
||||
}
|
||||
}
|
||||
|
||||
void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Converting linear matrix to blocks...\n");
|
||||
#endif
|
||||
for (int i = 0; i < DIM; i++)
|
||||
for (int j = 0; j < DIM; j++) {
|
||||
gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Converting blocked matrix to linear...\n");
|
||||
#endif
|
||||
for (int i = 0; i < DIM; i++)
|
||||
for (int j = 0; j < DIM; j++) {
|
||||
scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
char *result[3] = {"n/a","pass","FAIL"};
|
||||
const double eps = pow(2.0, -53);
|
||||
|
||||
// If number of arguments is not 4, print help
|
||||
if ( argc != 4) {
|
||||
printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
|
||||
exit( -1 );
|
||||
}
|
||||
|
||||
const int n = atoi(argv[1]); // matrix size
|
||||
const int ts = atoi(argv[2]); // tile size
|
||||
int check = atoi(argv[3]); // check result?
|
||||
|
||||
// Compute number of tiles
|
||||
const int nt = n / ts;
|
||||
assert((nt*ts) == n); // tile size should divide size
|
||||
|
||||
// Allocate matrix
|
||||
double * const matrix = (double *) malloc(n * n * sizeof(double));
|
||||
assert(matrix != NULL);
|
||||
|
||||
// Initialize matrix
|
||||
initialize_matrix(n, ts, matrix);
|
||||
|
||||
// Allocate original matrix, and duplicate it, for debugging purposes
|
||||
double * const original_matrix = (double *) malloc(n * n * sizeof(double));
|
||||
assert(original_matrix != NULL);
|
||||
|
||||
// Save a copy of the original matrix
|
||||
for (int i = 0; i < n * n; i++ ) {
|
||||
original_matrix[i] = matrix[i];
|
||||
}
|
||||
|
||||
// Set version description: Indirect blocked matrix
|
||||
const char *version = "I-Blocked matrix";
|
||||
|
||||
// Allocate blocked matrix
|
||||
double *Ah[nt][nt];
|
||||
for (int i = 0; i < nt; i++) {
|
||||
for (int j = 0; j < nt; j++) {
|
||||
Ah[i][j] = malloc(ts * ts * sizeof(double));
|
||||
assert(Ah[i][j] != NULL);
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------
|
||||
// Convert, compute (time), and re-convert
|
||||
// ---------------------------------------
|
||||
convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
|
||||
const float tref = get_time();
|
||||
cholesky(ts, nt, (double* (*)[nt]) Ah);
|
||||
const float time = get_time() - tref;
|
||||
convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
|
||||
|
||||
// Free blocked matrix
|
||||
for (int i = 0; i < nt; i++) {
|
||||
for (int j = 0; j < nt; j++) {
|
||||
assert(Ah[i][j] != NULL);
|
||||
free(Ah[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
// Check result, if requested
|
||||
if ( check ) {
|
||||
const char uplo = 'L';
|
||||
if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
|
||||
}
|
||||
|
||||
// Free original matrix, not needed anymore
|
||||
free(original_matrix);
|
||||
|
||||
// Compute GFLOPs (Not verified)
|
||||
float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
|
||||
|
||||
// Print results
|
||||
#ifdef VERBOSE
|
||||
printf( "\n" );
|
||||
printf( "============ CHOLESKY RESULTS ============\n" );
|
||||
printf( " test %s\n", argv[0]);
|
||||
printf( " version %s\n", version);
|
||||
printf( " matrix size: %dx%d\n", n, n);
|
||||
printf( " tile size: %dx%d\n", ts, ts);
|
||||
printf( " number of threads: %d\n", omp_get_max_threads());
|
||||
printf( " time (s): %f\n", time);
|
||||
printf( " performance (gflops): %f\n", gflops);
|
||||
printf( " check: %s\n", result[check]);
|
||||
printf( "==========================================\n" );
|
||||
#else
|
||||
printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
|
||||
#endif
|
||||
|
||||
// Free matrix
|
||||
free(matrix);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
23
00-worksharing/cholesky-for/Makefile
Normal file
23
00-worksharing/cholesky-for/Makefile
Normal file
|
@ -0,0 +1,23 @@
|
|||
BIN_DIR=../bin
|
||||
PROGRAM=$(BIN_DIR)/cholesky-for
|
||||
|
||||
MKL_HOME=$(MKLROOT)
|
||||
|
||||
CC=gcc
|
||||
CFLAGS=-O3 -std=gnu99 -I$(MKL_HOME)/include -fopenmp
|
||||
LDFLAGS=-L$(MKL_HOME)/lib/intel64 -lmkl_sequential -lmkl_core -lmkl_rt -lpthread -lm -fopenmp
|
||||
#VERBOSE=-DVERBOSE
|
||||
|
||||
all: $(PROGRAM)
|
||||
|
||||
$(PROGRAM): cholesky.c
|
||||
$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
|
||||
|
||||
$(BIN_DIR):
|
||||
mkdir $@
|
||||
|
||||
clean:
|
||||
rm -rf $(PROGRAM) *.o
|
||||
|
||||
wipe: clean
|
||||
rm -rf *.out *.err
|
289
00-worksharing/cholesky-for/cholesky.c
Normal file
289
00-worksharing/cholesky-for/cholesky.c
Normal file
|
@ -0,0 +1,289 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include <sys/time.h>
|
||||
#include <sys/times.h>
|
||||
|
||||
#include <math.h>
|
||||
#include <mkl.h>
|
||||
|
||||
#include "omp.h"
|
||||
|
||||
#if !defined(_OPENMP)
|
||||
int omp_get_max_threads() { return 1; }
|
||||
int omp_get_num_threads() { return 1; }
|
||||
#endif
|
||||
|
||||
void cholesky(int ts, int nt, double* Ah[nt][nt])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
|
||||
#endif
|
||||
|
||||
for (int k = 0; k < nt; k++) {
|
||||
// Diagonal Block factorization: using LAPACK
|
||||
LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
|
||||
|
||||
// Triangular systems
|
||||
for (int i = k + 1; i < nt; i++) {
|
||||
cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
|
||||
}
|
||||
|
||||
// Update trailing matrix
|
||||
for (int i = k + 1; i < nt; i++) {
|
||||
for (int j = k + 1; j < i; j++) {
|
||||
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
|
||||
}
|
||||
cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef VERBOSE
|
||||
printf("> ...end of Cholesky Factorization.\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
float get_time()
|
||||
{
|
||||
static double gtod_ref_time_sec = 0.0;
|
||||
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
|
||||
// If this is the first invocation of through dclock(), then initialize the
|
||||
// "reference time" global variable to the seconds field of the tv struct.
|
||||
if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
|
||||
|
||||
// Normalize the seconds field of the tv struct so that it is relative to the
|
||||
// "reference time" that was recorded during the first invocation of dclock().
|
||||
const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
|
||||
|
||||
// Compute the number of seconds since the reference time.
|
||||
const double t = norm_sec + tv.tv_usec * 1.0e-6;
|
||||
|
||||
return (float) t;
|
||||
}
|
||||
|
||||
// Robust Check the factorization of the matrix A2
|
||||
// Using directly Fortran services: dlacpy_, dtrmm, dlange_
|
||||
static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Checking the Cholesky Factorization... \n");
|
||||
#endif
|
||||
char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
|
||||
|
||||
double *Residual = (double *) malloc(N*N*sizeof(double));
|
||||
double *L1 = (double *) malloc(N*N*sizeof(double));
|
||||
double *L2 = (double *) malloc(N*N*sizeof(double));
|
||||
double *work = (double *) malloc(N*sizeof(double));
|
||||
|
||||
memset((void*)L1, 0, N*N*sizeof(double));
|
||||
memset((void*)L2, 0, N*N*sizeof(double));
|
||||
|
||||
double alpha= 1.0;
|
||||
|
||||
dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
|
||||
|
||||
/* Dealing with L'L or U'U */
|
||||
if (uplo == 'U'){
|
||||
dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
|
||||
dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
|
||||
dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
|
||||
} else{
|
||||
dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
|
||||
dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
|
||||
dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
|
||||
}
|
||||
|
||||
/* Compute the Residual || A -L'L|| */
|
||||
for (int i = 0; i < N; i++)
|
||||
for (int j = 0; j < N; j++)
|
||||
Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
|
||||
|
||||
double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
|
||||
double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
|
||||
|
||||
#ifdef VERBOSE
|
||||
printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
|
||||
#endif
|
||||
|
||||
const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
|
||||
|
||||
#ifdef VERBOSE
|
||||
if ( info_factorization) printf("> - Factorization is suspicious!\n");
|
||||
else printf("> - Factorization is CORRECT!\n");
|
||||
#endif
|
||||
|
||||
free(Residual); free(L1); free(L2); free(work);
|
||||
|
||||
return info_factorization;
|
||||
}
|
||||
|
||||
void initialize_matrix(const int n, const int ts, double *matrix)
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Initializing matrix with random values...\n");
|
||||
#endif
|
||||
|
||||
int ISEED[4] = {0,0,0,1};
|
||||
int intONE=1;
|
||||
|
||||
for (int i = 0; i < n*n; i+=n) {
|
||||
dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
|
||||
}
|
||||
|
||||
for (int i=0; i<n; i++) {
|
||||
for (int j=0; j<n; j++) {
|
||||
matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
|
||||
matrix[i*n+j] = matrix[j*n+i];
|
||||
}
|
||||
}
|
||||
|
||||
// Diagonal values
|
||||
for (int i = 0; i < n; i++) {
|
||||
matrix[i*n+i] += (double) n;
|
||||
}
|
||||
}
|
||||
|
||||
void gather_block(const int N, const int ts, double *Alin, double *A)
|
||||
{
|
||||
for (int i = 0; i < ts; i++)
|
||||
for (int j = 0; j < ts; j++) {
|
||||
A[i*ts + j] = Alin[i*N + j];
|
||||
}
|
||||
}
|
||||
|
||||
void scatter_block(const int N, const int ts, double *A, double *Alin)
|
||||
{
|
||||
for (int i = 0; i < ts; i++)
|
||||
for (int j = 0; j < ts; j++) {
|
||||
Alin[i*N + j] = A[i*ts + j];
|
||||
}
|
||||
}
|
||||
|
||||
void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Converting linear matrix to blocks...\n");
|
||||
#endif
|
||||
for (int i = 0; i < DIM; i++)
|
||||
for (int j = 0; j < DIM; j++) {
|
||||
gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Converting blocked matrix to linear...\n");
|
||||
#endif
|
||||
for (int i = 0; i < DIM; i++)
|
||||
for (int j = 0; j < DIM; j++) {
|
||||
scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
char *result[3] = {"n/a","pass","FAIL"};
|
||||
const double eps = pow(2.0, -53);
|
||||
|
||||
// If number of arguments is not 4, print help
|
||||
if ( argc != 4) {
|
||||
printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
|
||||
exit( -1 );
|
||||
}
|
||||
|
||||
const int n = atoi(argv[1]); // matrix size
|
||||
const int ts = atoi(argv[2]); // tile size
|
||||
int check = atoi(argv[3]); // check result?
|
||||
|
||||
// Compute number of tiles
|
||||
const int nt = n / ts;
|
||||
assert((nt*ts) == n); // tile size should divide size
|
||||
|
||||
// Allocate matrix
|
||||
double * const matrix = (double *) malloc(n * n * sizeof(double));
|
||||
assert(matrix != NULL);
|
||||
|
||||
// Initialize matrix
|
||||
initialize_matrix(n, ts, matrix);
|
||||
|
||||
// Allocate original matrix, and duplicate it, for debugging purposes
|
||||
double * const original_matrix = (double *) malloc(n * n * sizeof(double));
|
||||
assert(original_matrix != NULL);
|
||||
|
||||
// Save a copy of the original matrix
|
||||
for (int i = 0; i < n * n; i++ ) {
|
||||
original_matrix[i] = matrix[i];
|
||||
}
|
||||
|
||||
// Set version description: Indirect blocked matrix
|
||||
const char *version = "I-Blocked matrix";
|
||||
|
||||
// Allocate blocked matrix
|
||||
double *Ah[nt][nt];
|
||||
for (int i = 0; i < nt; i++) {
|
||||
for (int j = 0; j < nt; j++) {
|
||||
Ah[i][j] = malloc(ts * ts * sizeof(double));
|
||||
assert(Ah[i][j] != NULL);
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------
|
||||
// Convert, compute (time), and re-convert
|
||||
// ---------------------------------------
|
||||
convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
|
||||
const float tref = get_time();
|
||||
cholesky(ts, nt, (double* (*)[nt]) Ah);
|
||||
const float time = get_time() - tref;
|
||||
convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
|
||||
|
||||
// Free blocked matrix
|
||||
for (int i = 0; i < nt; i++) {
|
||||
for (int j = 0; j < nt; j++) {
|
||||
assert(Ah[i][j] != NULL);
|
||||
free(Ah[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
// Check result, if requested
|
||||
if ( check ) {
|
||||
const char uplo = 'L';
|
||||
if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
|
||||
}
|
||||
|
||||
// Free original matrix, not needed anymore
|
||||
free(original_matrix);
|
||||
|
||||
// Compute GFLOPs (Not verified)
|
||||
float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
|
||||
|
||||
// Print results
|
||||
#ifdef VERBOSE
|
||||
printf( "\n" );
|
||||
printf( "============ CHOLESKY RESULTS ============\n" );
|
||||
printf( " test %s\n", argv[0]);
|
||||
printf( " version %s\n", version);
|
||||
printf( " matrix size: %dx%d\n", n, n);
|
||||
printf( " tile size: %dx%d\n", ts, ts);
|
||||
printf( " number of threads: %d\n", omp_get_max_threads());
|
||||
printf( " time (s): %f\n", time);
|
||||
printf( " performance (gflops): %f\n", gflops);
|
||||
printf( " check: %s\n", result[check]);
|
||||
printf( "==========================================\n" );
|
||||
#else
|
||||
printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
|
||||
#endif
|
||||
|
||||
// Free matrix
|
||||
free(matrix);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
291
00-worksharing/cholesky-for/solution/cholesky.c
Normal file
291
00-worksharing/cholesky-for/solution/cholesky.c
Normal file
|
@ -0,0 +1,291 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include <sys/time.h>
|
||||
#include <sys/times.h>
|
||||
|
||||
#include <math.h>
|
||||
#include <mkl.h>
|
||||
|
||||
#include "omp.h"
|
||||
|
||||
#if !defined(_OPENMP)
|
||||
int omp_get_max_threads() { return 1; }
|
||||
int omp_get_num_threads() { return 1; }
|
||||
#endif
|
||||
|
||||
void cholesky(int ts, int nt, double* Ah[nt][nt])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
|
||||
#endif
|
||||
|
||||
for (int k = 0; k < nt; k++) {
|
||||
// Diagonal Block factorization: using LAPACK
|
||||
LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
|
||||
|
||||
// Triangular systems
|
||||
#pragma omp parallel for
|
||||
for (int i = k + 1; i < nt; i++) {
|
||||
cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
|
||||
}
|
||||
|
||||
// Update trailing matrix
|
||||
#pragma omp parallel for
|
||||
for (int i = k + 1; i < nt; i++) {
|
||||
for (int j = k + 1; j < i; j++) {
|
||||
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
|
||||
}
|
||||
cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef VERBOSE
|
||||
printf("> ...end of Cholesky Factorization.\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
float get_time()
|
||||
{
|
||||
static double gtod_ref_time_sec = 0.0;
|
||||
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
|
||||
// If this is the first invocation of through dclock(), then initialize the
|
||||
// "reference time" global variable to the seconds field of the tv struct.
|
||||
if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
|
||||
|
||||
// Normalize the seconds field of the tv struct so that it is relative to the
|
||||
// "reference time" that was recorded during the first invocation of dclock().
|
||||
const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
|
||||
|
||||
// Compute the number of seconds since the reference time.
|
||||
const double t = norm_sec + tv.tv_usec * 1.0e-6;
|
||||
|
||||
return (float) t;
|
||||
}
|
||||
|
||||
// Robust Check the factorization of the matrix A2
|
||||
// Using directly Fortran services: dlacpy_, dtrmm, dlange_
|
||||
static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Checking the Cholesky Factorization... \n");
|
||||
#endif
|
||||
char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
|
||||
|
||||
double *Residual = (double *) malloc(N*N*sizeof(double));
|
||||
double *L1 = (double *) malloc(N*N*sizeof(double));
|
||||
double *L2 = (double *) malloc(N*N*sizeof(double));
|
||||
double *work = (double *) malloc(N*sizeof(double));
|
||||
|
||||
memset((void*)L1, 0, N*N*sizeof(double));
|
||||
memset((void*)L2, 0, N*N*sizeof(double));
|
||||
|
||||
double alpha= 1.0;
|
||||
|
||||
dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
|
||||
|
||||
/* Dealing with L'L or U'U */
|
||||
if (uplo == 'U'){
|
||||
dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
|
||||
dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
|
||||
dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
|
||||
} else{
|
||||
dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
|
||||
dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
|
||||
dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
|
||||
}
|
||||
|
||||
/* Compute the Residual || A -L'L|| */
|
||||
for (int i = 0; i < N; i++)
|
||||
for (int j = 0; j < N; j++)
|
||||
Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
|
||||
|
||||
double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
|
||||
double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
|
||||
|
||||
#ifdef VERBOSE
|
||||
printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
|
||||
#endif
|
||||
|
||||
const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
|
||||
|
||||
#ifdef VERBOSE
|
||||
if ( info_factorization) printf("> - Factorization is suspicious!\n");
|
||||
else printf("> - Factorization is CORRECT!\n");
|
||||
#endif
|
||||
|
||||
free(Residual); free(L1); free(L2); free(work);
|
||||
|
||||
return info_factorization;
|
||||
}
|
||||
|
||||
void initialize_matrix(const int n, const int ts, double *matrix)
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Initializing matrix with random values...\n");
|
||||
#endif
|
||||
|
||||
int ISEED[4] = {0,0,0,1};
|
||||
int intONE=1;
|
||||
|
||||
for (int i = 0; i < n*n; i+=n) {
|
||||
dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
|
||||
}
|
||||
|
||||
for (int i=0; i<n; i++) {
|
||||
for (int j=0; j<n; j++) {
|
||||
matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
|
||||
matrix[i*n+j] = matrix[j*n+i];
|
||||
}
|
||||
}
|
||||
|
||||
// Diagonal values
|
||||
for (int i = 0; i < n; i++) {
|
||||
matrix[i*n+i] += (double) n;
|
||||
}
|
||||
}
|
||||
|
||||
void gather_block(const int N, const int ts, double *Alin, double *A)
|
||||
{
|
||||
for (int i = 0; i < ts; i++)
|
||||
for (int j = 0; j < ts; j++) {
|
||||
A[i*ts + j] = Alin[i*N + j];
|
||||
}
|
||||
}
|
||||
|
||||
void scatter_block(const int N, const int ts, double *A, double *Alin)
|
||||
{
|
||||
for (int i = 0; i < ts; i++)
|
||||
for (int j = 0; j < ts; j++) {
|
||||
Alin[i*N + j] = A[i*ts + j];
|
||||
}
|
||||
}
|
||||
|
||||
void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Converting linear matrix to blocks...\n");
|
||||
#endif
|
||||
for (int i = 0; i < DIM; i++)
|
||||
for (int j = 0; j < DIM; j++) {
|
||||
gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Converting blocked matrix to linear...\n");
|
||||
#endif
|
||||
for (int i = 0; i < DIM; i++)
|
||||
for (int j = 0; j < DIM; j++) {
|
||||
scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
char *result[3] = {"n/a","pass","FAIL"};
|
||||
const double eps = pow(2.0, -53);
|
||||
|
||||
// If number of arguments is not 4, print help
|
||||
if ( argc != 4) {
|
||||
printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
|
||||
exit( -1 );
|
||||
}
|
||||
|
||||
const int n = atoi(argv[1]); // matrix size
|
||||
const int ts = atoi(argv[2]); // tile size
|
||||
int check = atoi(argv[3]); // check result?
|
||||
|
||||
// Compute number of tiles
|
||||
const int nt = n / ts;
|
||||
assert((nt*ts) == n); // tile size should divide size
|
||||
|
||||
// Allocate matrix
|
||||
double * const matrix = (double *) malloc(n * n * sizeof(double));
|
||||
assert(matrix != NULL);
|
||||
|
||||
// Initialize matrix
|
||||
initialize_matrix(n, ts, matrix);
|
||||
|
||||
// Allocate original matrix, and duplicate it, for debugging purposes
|
||||
double * const original_matrix = (double *) malloc(n * n * sizeof(double));
|
||||
assert(original_matrix != NULL);
|
||||
|
||||
// Save a copy of the original matrix
|
||||
for (int i = 0; i < n * n; i++ ) {
|
||||
original_matrix[i] = matrix[i];
|
||||
}
|
||||
|
||||
// Set version description: Indirect blocked matrix
|
||||
const char *version = "I-Blocked matrix";
|
||||
|
||||
// Allocate blocked matrix
|
||||
double *Ah[nt][nt];
|
||||
for (int i = 0; i < nt; i++) {
|
||||
for (int j = 0; j < nt; j++) {
|
||||
Ah[i][j] = malloc(ts * ts * sizeof(double));
|
||||
assert(Ah[i][j] != NULL);
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------
|
||||
// Convert, compute (time), and re-convert
|
||||
// ---------------------------------------
|
||||
convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
|
||||
const float tref = get_time();
|
||||
cholesky(ts, nt, (double* (*)[nt]) Ah);
|
||||
const float time = get_time() - tref;
|
||||
convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
|
||||
|
||||
// Free blocked matrix
|
||||
for (int i = 0; i < nt; i++) {
|
||||
for (int j = 0; j < nt; j++) {
|
||||
assert(Ah[i][j] != NULL);
|
||||
free(Ah[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
// Check result, if requested
|
||||
if ( check ) {
|
||||
const char uplo = 'L';
|
||||
if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
|
||||
}
|
||||
|
||||
// Free original matrix, not needed anymore
|
||||
free(original_matrix);
|
||||
|
||||
// Compute GFLOPs (Not verified)
|
||||
float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
|
||||
|
||||
// Print results
|
||||
#ifdef VERBOSE
|
||||
printf( "\n" );
|
||||
printf( "============ CHOLESKY RESULTS ============\n" );
|
||||
printf( " test %s\n", argv[0]);
|
||||
printf( " version %s\n", version);
|
||||
printf( " matrix size: %dx%d\n", n, n);
|
||||
printf( " tile size: %dx%d\n", ts, ts);
|
||||
printf( " number of threads: %d\n", omp_get_max_threads());
|
||||
printf( " time (s): %f\n", time);
|
||||
printf( " performance (gflops): %f\n", gflops);
|
||||
printf( " check: %s\n", result[check]);
|
||||
printf( "==========================================\n" );
|
||||
#else
|
||||
printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
|
||||
#endif
|
||||
|
||||
// Free matrix
|
||||
free(matrix);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
23
00-worksharing/matmul/Makefile
Normal file
23
00-worksharing/matmul/Makefile
Normal file
|
@ -0,0 +1,23 @@
|
|||
BIN_DIR=../bin
|
||||
PROGRAM=$(BIN_DIR)/matmul
|
||||
|
||||
MKL_HOME=$(MKLROOT)
|
||||
|
||||
CC=gcc
|
||||
CFLAGS=-O3 -std=gnu99 -fopenmp
|
||||
LDFLAGS=-fopenmp
|
||||
#VERBOSE=-DVERBOSE
|
||||
|
||||
all: $(PROGRAM)
|
||||
|
||||
$(PROGRAM): matmul.c
|
||||
$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
|
||||
|
||||
$(BIN_DIR):
|
||||
mkdir $@
|
||||
|
||||
clean:
|
||||
rm -rf $(PROGRAM) *.o
|
||||
|
||||
wipe: clean
|
||||
rm -rf *.out *.err
|
106
00-worksharing/matmul/matmul.c
Normal file
106
00-worksharing/matmul/matmul.c
Normal file
|
@ -0,0 +1,106 @@
|
|||
//===-- matmul.c - Different implementations of matrix multiplies -*- C -*-===//
|
||||
//
|
||||
// Part of the LOMP Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
#define DUMP_MATRIX 0
|
||||
|
||||
void matmul_seq(double * C, double * A, double * B, size_t n) {
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (size_t k = 0; k < n; ++k) {
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
C[i * n + j] += A[i * n + k] * B[k * n + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void matmul_par(double * C, double * A, double * B, size_t n) {
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (size_t k = 0; k < n; ++k) {
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
C[i * n + j] += A[i * n + k] * B[k * n + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void init_mat(double * C, double * A, double * B, size_t n) {
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
C[i * n + j] = 0.0;
|
||||
A[i * n + j] = 0.5;
|
||||
B[i * n + j] = 0.25;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void dump_mat(double * mtx, size_t n) {
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
printf("%f ", mtx[i * n + j]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
double sum_mat(double * mtx, size_t n) {
|
||||
double sum = 0.0;
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
sum += mtx[i * n + j];
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
double ts, te;
|
||||
double t_seq;
|
||||
|
||||
double * C;
|
||||
double * A;
|
||||
double * B;
|
||||
|
||||
// If number of arguments is not 1, print help
|
||||
if (argc != 2) {
|
||||
printf("%s: matrix_size\n", argv[0]);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
const int n = atoi(argv[1]); // matrix size
|
||||
|
||||
C = (double *)malloc(sizeof(*C) * n * n);
|
||||
A = (double *)malloc(sizeof(*A) * n * n);
|
||||
B = (double *)malloc(sizeof(*B) * n * n);
|
||||
|
||||
init_mat(C, A, B, n);
|
||||
ts = omp_get_wtime();
|
||||
matmul_seq(C, A, B, n);
|
||||
te = omp_get_wtime();
|
||||
#if DUMP_MATRIX
|
||||
dump_mat(C, n);
|
||||
#endif
|
||||
t_seq = te - ts;
|
||||
printf("Sum of matrix (serial): %f, wall time %lf, speed-up %.2lf\n",
|
||||
sum_mat(C, n), (te - ts), t_seq / (te - ts));
|
||||
|
||||
init_mat(C, A, B, n);
|
||||
ts = omp_get_wtime();
|
||||
matmul_par(C, A, B, n);
|
||||
te = omp_get_wtime();
|
||||
#if DUMP_MATRIX
|
||||
dump_mat(C, n);
|
||||
#endif
|
||||
printf("Sum of matrix (parallel): %f, wall time %lf, speed-up %.2lf\n",
|
||||
sum_mat(C, n), (te - ts), t_seq / (te - ts));
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
108
00-worksharing/matmul/solution/matmul.c
Normal file
108
00-worksharing/matmul/solution/matmul.c
Normal file
|
@ -0,0 +1,108 @@
|
|||
//===-- matmul.c - Different implementations of matrix multiplies -*- C -*-===//
|
||||
//
|
||||
// Part of the LOMP Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
#define DUMP_MATRIX 0
|
||||
|
||||
void matmul_seq(double * C, double * A, double * B, size_t n) {
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (size_t k = 0; k < n; ++k) {
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
C[i * n + j] += A[i * n + k] * B[k * n + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void matmul_par(double * C, double * A, double * B, size_t n) {
|
||||
#pragma omp parallel for shared(A,B,C) firstprivate(n) \
|
||||
schedule(static) // collapse(2)
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (size_t k = 0; k < n; ++k) {
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
C[i * n + j] += A[i * n + k] * B[k * n + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void init_mat(double * C, double * A, double * B, size_t n) {
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
C[i * n + j] = 0.0;
|
||||
A[i * n + j] = 0.5;
|
||||
B[i * n + j] = 0.25;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void dump_mat(double * mtx, size_t n) {
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
printf("%f ", mtx[i * n + j]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
double sum_mat(double * mtx, size_t n) {
|
||||
double sum = 0.0;
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
sum += mtx[i * n + j];
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
double ts, te;
|
||||
double t_seq;
|
||||
|
||||
double * C;
|
||||
double * A;
|
||||
double * B;
|
||||
|
||||
// If number of arguments is not 1, print help
|
||||
if (argc != 2) {
|
||||
printf("%s: matrix_size\n", argv[0]);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
const int n = atoi(argv[1]); // matrix size
|
||||
|
||||
C = (double *)malloc(sizeof(*C) * n * n);
|
||||
A = (double *)malloc(sizeof(*A) * n * n);
|
||||
B = (double *)malloc(sizeof(*B) * n * n);
|
||||
|
||||
init_mat(C, A, B, n);
|
||||
ts = omp_get_wtime();
|
||||
matmul_seq(C, A, B, n);
|
||||
te = omp_get_wtime();
|
||||
#if DUMP_MATRIX
|
||||
dump_mat(C, n);
|
||||
#endif
|
||||
t_seq = te - ts;
|
||||
printf("Sum of matrix (serial): %f, wall time %lf, speed-up %.2lf\n",
|
||||
sum_mat(C, n), (te - ts), t_seq / (te - ts));
|
||||
|
||||
init_mat(C, A, B, n);
|
||||
ts = omp_get_wtime();
|
||||
matmul_par(C, A, B, n);
|
||||
te = omp_get_wtime();
|
||||
#if DUMP_MATRIX
|
||||
dump_mat(C, n);
|
||||
#endif
|
||||
printf("Sum of matrix (parallel): %f, wall time %lf, speed-up %.2lf\n",
|
||||
sum_mat(C, n), (te - ts), t_seq / (te - ts));
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
21
00-worksharing/sin-cos/Makefile
Normal file
21
00-worksharing/sin-cos/Makefile
Normal file
|
@ -0,0 +1,21 @@
|
|||
BIN_DIR=../bin
|
||||
PROGRAM=$(BIN_DIR)/sin-cos
|
||||
|
||||
CC=gcc
|
||||
CFLAGS=-O3 -std=gnu99 -fopenmp
|
||||
LDFLAGS=-lm -fopenmp
|
||||
#VERBOSE=-DVERBOSE
|
||||
|
||||
all: $(PROGRAM)
|
||||
|
||||
$(PROGRAM): sin-cos.c
|
||||
$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
|
||||
|
||||
$(BIN_DIR):
|
||||
mkdir $@
|
||||
|
||||
clean:
|
||||
rm -rf $(PROGRAM) *.o
|
||||
|
||||
wipe: clean
|
||||
rm -rf *.out *.err
|
58
00-worksharing/sin-cos/sin-cos.c
Normal file
58
00-worksharing/sin-cos/sin-cos.c
Normal file
|
@ -0,0 +1,58 @@
|
|||
/*
|
||||
* OpenMP lecture exercises
|
||||
* Copyright (C) 2011 by Christian Terboven <terboven@rz.rwth-aachen.de>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include <omp.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
|
||||
double do_some_computation(int i)
|
||||
{
|
||||
double t = 0.0;
|
||||
int j;
|
||||
for (j = 0; j < i*i; j++)
|
||||
{
|
||||
t += sin((double)j) * cos((double)j);
|
||||
}
|
||||
return t;
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
const int dimension = 500;
|
||||
int i;
|
||||
double result = 0.0;
|
||||
|
||||
double t1 = omp_get_wtime();
|
||||
|
||||
for (i = 0; i < dimension; i++)
|
||||
{
|
||||
result += do_some_computation(i);
|
||||
}
|
||||
|
||||
double t2 = omp_get_wtime();
|
||||
printf("Computation took %.3lf seconds.\n", t2 - t1);
|
||||
printf("Result is %.3lf.\n", result);
|
||||
|
||||
return 0;
|
||||
}
|
59
00-worksharing/sin-cos/solution/sin-cos.c
Normal file
59
00-worksharing/sin-cos/solution/sin-cos.c
Normal file
|
@ -0,0 +1,59 @@
|
|||
/*
|
||||
* OpenMP lecture exercises
|
||||
* Copyright (C) 2011 by Christian Terboven <terboven@rz.rwth-aachen.de>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include <omp.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
|
||||
double do_some_computation(int i)
|
||||
{
|
||||
double t = 0.0;
|
||||
int j;
|
||||
for (j = 0; j < i*i; j++)
|
||||
{
|
||||
t += sin((double)j) * cos((double)j);
|
||||
}
|
||||
return t;
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
const int dimension = 500;
|
||||
int i;
|
||||
double result = 0.0;
|
||||
|
||||
double t1 = omp_get_wtime();
|
||||
|
||||
#pragma omp parallel for schedule(dynamic) reduction(+:result)
|
||||
for (i = 0; i < dimension; i++)
|
||||
{
|
||||
result += do_some_computation(i);
|
||||
}
|
||||
|
||||
double t2 = omp_get_wtime();
|
||||
printf("Computation took %.3lf seconds.\n", t2 - t1);
|
||||
printf("Result is %.3lf.\n", result);
|
||||
|
||||
return 0;
|
||||
}
|
14
01-tasks/Makefile
Normal file
14
01-tasks/Makefile
Normal file
|
@ -0,0 +1,14 @@
|
|||
ALL_RECURSIVE=all-recursive clean-recursive
|
||||
|
||||
DIRS=$(shell ls -d *-* | sed '/_/d' 2>/dev/null)
|
||||
|
||||
all: $(CONFIG) all-recursive
|
||||
clean: $(CONFIG) clean-recursive
|
||||
|
||||
$(ALL_RECURSIVE):
|
||||
@failcom='exit 1';\
|
||||
target=`echo $@ | sed s/-recursive//`; \
|
||||
for subdir in $(DIRS); do \
|
||||
(cd $$subdir && $(MAKE) $$target) || eval $$failcom; \
|
||||
done;
|
||||
|
0
01-tasks/bin/.gitdir
Normal file
0
01-tasks/bin/.gitdir
Normal file
28
01-tasks/bin/run-cholesky.sh
Executable file
28
01-tasks/bin/run-cholesky.sh
Executable file
|
@ -0,0 +1,28 @@
|
|||
#!/bin/bash
|
||||
#SBATCH --job-name=cholesky
|
||||
#SBATCH --workdir=.
|
||||
#SBATCH --output=cholesky_%j.out
|
||||
#SBATCH --error=cholesky_%j.err
|
||||
#SBATCH --cpus-per-task=48
|
||||
#SBATCH --ntasks=1
|
||||
#SBATCH --time=00:30:00
|
||||
#SBATCH --qos=debug
|
||||
|
||||
export IFS=";"
|
||||
|
||||
THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48"
|
||||
#THREADS="01;02;04;06;08;10;12"
|
||||
|
||||
#MSIZES="2048;4096;8192;16384"
|
||||
MSIZES="4096"
|
||||
|
||||
BSIZES="512"
|
||||
|
||||
for MS in $MSIZES; do
|
||||
for BS in $BSIZES; do
|
||||
for threads in $THREADS; do
|
||||
OMP_NUM_THREADS=$threads ./cholesky-tsk $MS $BS 0
|
||||
done
|
||||
done
|
||||
done
|
||||
|
20
01-tasks/bin/run-sudoku.sh
Executable file
20
01-tasks/bin/run-sudoku.sh
Executable file
|
@ -0,0 +1,20 @@
|
|||
#!/bin/bash
|
||||
#SBATCH --job-name=sudoku
|
||||
#SBATCH --workdir=.
|
||||
#SBATCH --output=sudoku_%j.out
|
||||
#SBATCH --error=sudoku_%j.err
|
||||
#SBATCH --cpus-per-task=48
|
||||
#SBATCH --ntasks=1
|
||||
#SBATCH --time=00:30:00
|
||||
#SBATCH --qos=debug
|
||||
|
||||
export IFS=";"
|
||||
|
||||
THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48"
|
||||
#THREADS="01;02;04;06;08;10;12"
|
||||
|
||||
for threads in $THREADS; do
|
||||
OMP_NUM_THREADS=$threads ./sudoku-tsk 9 3 sudoku-9x9-1.txt
|
||||
#OMP_NUM_THREADS=$threads ./sudoku-tsk 16 4 sudoku-16x16-1.txt
|
||||
#OMP_NUM_THREADS=$threads ./sudoku-tsk 25 5 sudoku-25x25-1.txt
|
||||
done
|
16
01-tasks/bin/sudoku-16x16-1.txt
Normal file
16
01-tasks/bin/sudoku-16x16-1.txt
Normal file
|
@ -0,0 +1,16 @@
|
|||
0 6 0 0 0 0 0 8 11 0 0 15 14 0 0 16
|
||||
15 11 0 0 0 16 14 0 0 0 12 0 0 6 0 0
|
||||
13 0 9 12 0 0 0 0 3 16 14 0 15 11 10 0
|
||||
2 0 16 0 11 0 15 10 1 0 0 0 0 0 0 0
|
||||
0 15 11 10 0 0 16 2 13 8 9 12 0 0 0 0
|
||||
12 13 0 0 4 1 5 6 2 3 0 0 0 0 11 10
|
||||
5 0 6 1 12 0 9 0 15 11 10 7 16 0 0 3
|
||||
0 2 0 0 0 10 0 11 6 0 5 0 0 13 0 9
|
||||
10 7 15 11 16 0 0 0 12 13 0 0 0 0 0 6
|
||||
9 0 0 0 0 0 1 0 0 2 0 16 10 0 0 11
|
||||
1 0 4 6 9 13 0 0 7 0 11 0 3 16 0 0
|
||||
16 14 0 0 7 0 10 15 4 6 1 0 0 0 13 8
|
||||
11 10 0 15 0 0 0 16 9 12 13 0 0 1 5 4
|
||||
0 0 12 0 1 4 6 0 16 0 0 0 11 10 0 0
|
||||
0 0 5 0 8 12 13 0 10 0 0 11 2 0 0 14
|
||||
3 16 0 0 10 0 0 7 0 0 6 0 0 0 12 0
|
25
01-tasks/bin/sudoku-25x25-1.txt
Normal file
25
01-tasks/bin/sudoku-25x25-1.txt
Normal file
|
@ -0,0 +1,25 @@
|
|||
0 8 0 18 0 12 20 21 16 0 0 0 0 0 0 15 0 0 24 5 7 0 19 0 0
|
||||
0 0 15 9 5 4 0 23 0 18 12 20 0 21 16 7 0 0 0 2 1 0 0 0 0
|
||||
0 0 0 0 14 7 0 0 0 0 0 24 0 0 0 0 10 0 0 16 0 0 23 18 6
|
||||
21 20 12 0 16 0 0 25 14 22 7 0 0 19 2 4 0 23 8 6 0 24 0 0 0
|
||||
19 0 7 0 2 15 0 0 0 9 0 0 18 0 0 0 0 0 0 0 12 0 21 0 16
|
||||
0 23 6 0 0 16 0 0 0 20 14 25 11 0 0 5 24 7 3 9 2 19 1 0 17
|
||||
4 21 0 20 0 0 0 12 22 11 2 19 0 0 0 0 0 15 23 0 0 3 7 0 9
|
||||
12 25 14 0 22 0 19 0 0 13 0 0 24 7 0 0 0 0 21 10 6 0 15 8 18
|
||||
7 0 0 24 9 6 0 15 18 0 0 0 0 0 10 0 13 1 19 0 0 25 0 11 22
|
||||
0 19 2 13 17 5 0 0 9 0 6 0 8 15 0 14 11 12 25 22 16 0 0 0 10
|
||||
0 0 0 1 0 0 0 0 3 7 8 0 15 0 0 11 12 10 0 0 0 0 18 4 0
|
||||
10 16 11 12 25 0 14 22 19 1 0 0 0 17 3 20 0 0 6 21 8 0 0 0 0
|
||||
0 0 8 0 23 20 0 18 21 4 11 16 0 0 25 0 7 0 2 0 0 14 22 0 0
|
||||
0 0 0 0 0 0 0 0 0 0 0 0 0 22 19 8 15 0 5 23 24 0 17 0 3
|
||||
0 2 0 7 3 8 0 9 23 15 20 0 4 0 21 13 1 0 14 19 0 0 10 12 25
|
||||
2 0 0 0 0 18 15 5 0 23 0 0 21 6 20 17 0 14 1 13 0 12 0 25 11
|
||||
0 1 0 19 13 9 7 0 24 0 0 0 23 0 8 0 25 0 0 11 0 0 0 21 0
|
||||
0 0 10 0 0 0 0 0 0 0 17 1 0 14 0 18 0 5 15 8 9 0 0 0 0
|
||||
16 12 0 0 11 0 0 14 0 0 0 0 3 2 24 0 21 0 4 20 0 15 0 0 8
|
||||
0 0 0 0 0 10 0 0 0 0 0 12 0 0 11 9 3 0 0 0 17 0 14 19 13
|
||||
24 0 0 0 15 21 18 0 4 0 0 0 16 20 0 3 2 13 0 7 19 22 0 0 1
|
||||
0 18 0 0 0 0 10 0 0 16 0 22 0 0 0 23 0 24 9 0 3 17 0 0 0
|
||||
0 0 0 0 0 0 0 0 15 5 0 0 6 8 4 0 14 11 22 1 25 0 20 16 12
|
||||
0 22 19 14 1 3 0 0 0 0 23 9 5 0 15 25 0 20 10 12 21 0 8 0 4
|
||||
20 0 25 16 0 0 22 11 1 0 3 0 2 13 7 21 0 0 0 0 0 9 0 0 0
|
10
01-tasks/bin/sudoku-9x9-1.txt
Normal file
10
01-tasks/bin/sudoku-9x9-1.txt
Normal file
|
@ -0,0 +1,10 @@
|
|||
5 3 0 0 7 0 0 0 0
|
||||
6 0 0 1 9 5 3 0 0
|
||||
0 9 8 0 0 0 0 6 0
|
||||
8 0 0 0 6 0 0 0 3
|
||||
4 0 0 8 0 3 0 0 1
|
||||
7 0 0 0 2 0 0 0 6
|
||||
0 6 0 0 0 0 2 8 0
|
||||
0 0 0 4 1 9 0 0 5
|
||||
0 0 0 0 8 0 0 7 9
|
||||
|
23
01-tasks/cholesky-tsk/Makefile
Normal file
23
01-tasks/cholesky-tsk/Makefile
Normal file
|
@ -0,0 +1,23 @@
|
|||
BIN_DIR=../bin
|
||||
PROGRAM=$(BIN_DIR)/cholesky-tsk
|
||||
|
||||
MKL_HOME=$(MKLROOT)
|
||||
|
||||
CC=gcc
|
||||
CFLAGS=-O3 -std=gnu99 -I$(MKL_HOME)/include -fopenmp
|
||||
LDFLAGS=-L$(MKL_HOME)/lib/intel64 -lmkl_sequential -lmkl_core -lmkl_rt -lpthread -lm -fopenmp
|
||||
VERBOSE=-DVERBOSE
|
||||
|
||||
all: $(PROGRAM)
|
||||
|
||||
$(PROGRAM): cholesky.c
|
||||
$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
|
||||
|
||||
$(BIN_DIR):
|
||||
mkdir $@
|
||||
|
||||
clean:
|
||||
rm -rf $(PROGRAM) *.o
|
||||
|
||||
wipe: clean
|
||||
rm -rf *.out *.err
|
296
01-tasks/cholesky-tsk/cholesky-solved.c
Normal file
296
01-tasks/cholesky-tsk/cholesky-solved.c
Normal file
|
@ -0,0 +1,296 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include <sys/time.h>
|
||||
#include <sys/times.h>
|
||||
|
||||
#include <math.h>
|
||||
#include <mkl.h>
|
||||
|
||||
#include "omp.h"
|
||||
|
||||
#if !defined(_OPENMP)
|
||||
int omp_get_max_threads() { return 1; }
|
||||
int omp_get_num_threads() { return 1; }
|
||||
#endif
|
||||
|
||||
void cholesky(int ts, int nt, double* Ah[nt][nt])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
|
||||
#endif
|
||||
|
||||
#pragma omp parallel
|
||||
#pragma omp single
|
||||
for (int k = 0; k < nt; k++) {
|
||||
// Diagonal Block factorization: using LAPACK
|
||||
LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
|
||||
|
||||
// Triangular systems
|
||||
for (int i = k + 1; i < nt; i++) {
|
||||
#pragma omp task
|
||||
cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
|
||||
}
|
||||
#pragma omp taskwait
|
||||
|
||||
// Update trailing matrix
|
||||
for (int i = k + 1; i < nt; i++) {
|
||||
for (int j = k + 1; j < i; j++) {
|
||||
#pragma omp task
|
||||
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
|
||||
}
|
||||
#pragma omp task
|
||||
cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
|
||||
}
|
||||
#pragma omp taskwait
|
||||
}
|
||||
|
||||
#ifdef VERBOSE
|
||||
printf("> ...end of Cholesky Factorization.\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
float get_time()
|
||||
{
|
||||
static double gtod_ref_time_sec = 0.0;
|
||||
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
|
||||
// If this is the first invocation of through dclock(), then initialize the
|
||||
// "reference time" global variable to the seconds field of the tv struct.
|
||||
if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
|
||||
|
||||
// Normalize the seconds field of the tv struct so that it is relative to the
|
||||
// "reference time" that was recorded during the first invocation of dclock().
|
||||
const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
|
||||
|
||||
// Compute the number of seconds since the reference time.
|
||||
const double t = norm_sec + tv.tv_usec * 1.0e-6;
|
||||
|
||||
return (float) t;
|
||||
}
|
||||
|
||||
// Robust Check the factorization of the matrix A2
|
||||
// Using directly Fortran services: dlacpy_, dtrmm, dlange_
|
||||
static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Checking the Cholesky Factorization... \n");
|
||||
#endif
|
||||
char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
|
||||
|
||||
double *Residual = (double *) malloc(N*N*sizeof(double));
|
||||
double *L1 = (double *) malloc(N*N*sizeof(double));
|
||||
double *L2 = (double *) malloc(N*N*sizeof(double));
|
||||
double *work = (double *) malloc(N*sizeof(double));
|
||||
|
||||
memset((void*)L1, 0, N*N*sizeof(double));
|
||||
memset((void*)L2, 0, N*N*sizeof(double));
|
||||
|
||||
double alpha= 1.0;
|
||||
|
||||
dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
|
||||
|
||||
/* Dealing with L'L or U'U */
|
||||
if (uplo == 'U'){
|
||||
dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
|
||||
dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
|
||||
dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
|
||||
} else{
|
||||
dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
|
||||
dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
|
||||
dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
|
||||
}
|
||||
|
||||
/* Compute the Residual || A -L'L|| */
|
||||
for (int i = 0; i < N; i++)
|
||||
for (int j = 0; j < N; j++)
|
||||
Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
|
||||
|
||||
double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
|
||||
double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
|
||||
|
||||
#ifdef VERBOSE
|
||||
printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
|
||||
#endif
|
||||
|
||||
const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
|
||||
|
||||
#ifdef VERBOSE
|
||||
if ( info_factorization) printf("> - Factorization is suspicious!\n");
|
||||
else printf("> - Factorization is CORRECT!\n");
|
||||
#endif
|
||||
|
||||
free(Residual); free(L1); free(L2); free(work);
|
||||
|
||||
return info_factorization;
|
||||
}
|
||||
|
||||
void initialize_matrix(const int n, const int ts, double *matrix)
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Initializing matrix with random values...\n");
|
||||
#endif
|
||||
|
||||
int ISEED[4] = {0,0,0,1};
|
||||
int intONE=1;
|
||||
|
||||
for (int i = 0; i < n*n; i+=n) {
|
||||
dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
|
||||
}
|
||||
|
||||
for (int i=0; i<n; i++) {
|
||||
for (int j=0; j<n; j++) {
|
||||
matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
|
||||
matrix[i*n+j] = matrix[j*n+i];
|
||||
}
|
||||
}
|
||||
|
||||
// Diagonal values
|
||||
for (int i = 0; i < n; i++) {
|
||||
matrix[i*n+i] += (double) n;
|
||||
}
|
||||
}
|
||||
|
||||
void gather_block(const int N, const int ts, double *Alin, double *A)
|
||||
{
|
||||
for (int i = 0; i < ts; i++)
|
||||
for (int j = 0; j < ts; j++) {
|
||||
A[i*ts + j] = Alin[i*N + j];
|
||||
}
|
||||
}
|
||||
|
||||
void scatter_block(const int N, const int ts, double *A, double *Alin)
|
||||
{
|
||||
for (int i = 0; i < ts; i++)
|
||||
for (int j = 0; j < ts; j++) {
|
||||
Alin[i*N + j] = A[i*ts + j];
|
||||
}
|
||||
}
|
||||
|
||||
void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Converting linear matrix to blocks...\n");
|
||||
#endif
|
||||
for (int i = 0; i < DIM; i++)
|
||||
for (int j = 0; j < DIM; j++) {
|
||||
gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Converting blocked matrix to linear...\n");
|
||||
#endif
|
||||
for (int i = 0; i < DIM; i++)
|
||||
for (int j = 0; j < DIM; j++) {
|
||||
scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
char *result[3] = {"n/a","pass","FAIL"};
|
||||
const double eps = pow(2.0, -53);
|
||||
|
||||
// If number of arguments is not 4, print help
|
||||
if ( argc != 4) {
|
||||
printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
|
||||
exit( -1 );
|
||||
}
|
||||
|
||||
const int n = atoi(argv[1]); // matrix size
|
||||
const int ts = atoi(argv[2]); // tile size
|
||||
int check = atoi(argv[3]); // check result?
|
||||
|
||||
// Compute number of tiles
|
||||
const int nt = n / ts;
|
||||
assert((nt*ts) == n); // tile size should divide size
|
||||
|
||||
// Allocate matrix
|
||||
double * const matrix = (double *) malloc(n * n * sizeof(double));
|
||||
assert(matrix != NULL);
|
||||
|
||||
// Initialize matrix
|
||||
initialize_matrix(n, ts, matrix);
|
||||
|
||||
// Allocate original matrix, and duplicate it, for debugging purposes
|
||||
double * const original_matrix = (double *) malloc(n * n * sizeof(double));
|
||||
assert(original_matrix != NULL);
|
||||
|
||||
// Save a copy of the original matrix
|
||||
for (int i = 0; i < n * n; i++ ) {
|
||||
original_matrix[i] = matrix[i];
|
||||
}
|
||||
|
||||
// Set version description: Indirect blocked matrix
|
||||
const char *version = "I-Blocked matrix";
|
||||
|
||||
// Allocate blocked matrix
|
||||
double *Ah[nt][nt];
|
||||
for (int i = 0; i < nt; i++) {
|
||||
for (int j = 0; j < nt; j++) {
|
||||
Ah[i][j] = malloc(ts * ts * sizeof(double));
|
||||
assert(Ah[i][j] != NULL);
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------
|
||||
// Convert, compute (time), and re-convert
|
||||
// ---------------------------------------
|
||||
convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
|
||||
const float tref = get_time();
|
||||
cholesky(ts, nt, (double* (*)[nt]) Ah);
|
||||
const float time = get_time() - tref;
|
||||
convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
|
||||
|
||||
// Free blocked matrix
|
||||
for (int i = 0; i < nt; i++) {
|
||||
for (int j = 0; j < nt; j++) {
|
||||
assert(Ah[i][j] != NULL);
|
||||
free(Ah[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
// Check result, if requested
|
||||
if ( check ) {
|
||||
const char uplo = 'L';
|
||||
if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
|
||||
}
|
||||
|
||||
// Free original matrix, not needed anymore
|
||||
free(original_matrix);
|
||||
|
||||
// Compute GFLOPs (Not verified)
|
||||
float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
|
||||
|
||||
// Print results
|
||||
#ifdef VERBOSE
|
||||
printf( "\n" );
|
||||
printf( "============ CHOLESKY RESULTS ============\n" );
|
||||
printf( " test %s\n", argv[0]);
|
||||
printf( " version %s\n", version);
|
||||
printf( " matrix size: %dx%d\n", n, n);
|
||||
printf( " tile size: %dx%d\n", ts, ts);
|
||||
printf( " number of threads: %d\n", omp_get_max_threads());
|
||||
printf( " time (s): %f\n", time);
|
||||
printf( " performance (gflops): %f\n", gflops);
|
||||
printf( " check: %s\n", result[check]);
|
||||
printf( "==========================================\n" );
|
||||
#else
|
||||
printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
|
||||
#endif
|
||||
|
||||
// Free matrix
|
||||
free(matrix);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
292
01-tasks/cholesky-tsk/cholesky.c
Normal file
292
01-tasks/cholesky-tsk/cholesky.c
Normal file
|
@ -0,0 +1,292 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include <sys/time.h>
|
||||
#include <sys/times.h>
|
||||
|
||||
#include <math.h>
|
||||
#include <mkl.h>
|
||||
|
||||
#include "omp.h"
|
||||
|
||||
#if !defined(_OPENMP)
|
||||
int omp_get_max_threads() { return 1; }
|
||||
int omp_get_num_threads() { return 1; }
|
||||
#endif
|
||||
|
||||
void cholesky(int ts, int nt, double* Ah[nt][nt])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
|
||||
#endif
|
||||
|
||||
// TODO: Parallel region is required in order to run in parallel.
|
||||
// TODO: Kernel invocations could be task candidates.
|
||||
// TODO: Add scheduler restrictions: taskwait, taskgroup, etc.
|
||||
for (int k = 0; k < nt; k++) {
|
||||
// Diagonal Block factorization: using LAPACK
|
||||
LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
|
||||
|
||||
// Triangular systems
|
||||
for (int i = k + 1; i < nt; i++) {
|
||||
cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
|
||||
}
|
||||
|
||||
// Update trailing matrix
|
||||
for (int i = k + 1; i < nt; i++) {
|
||||
for (int j = k + 1; j < i; j++) {
|
||||
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
|
||||
}
|
||||
cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef VERBOSE
|
||||
printf("> ...end of Cholesky Factorization.\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
float get_time()
|
||||
{
|
||||
static double gtod_ref_time_sec = 0.0;
|
||||
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
|
||||
// If this is the first invocation of through dclock(), then initialize the
|
||||
// "reference time" global variable to the seconds field of the tv struct.
|
||||
if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
|
||||
|
||||
// Normalize the seconds field of the tv struct so that it is relative to the
|
||||
// "reference time" that was recorded during the first invocation of dclock().
|
||||
const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
|
||||
|
||||
// Compute the number of seconds since the reference time.
|
||||
const double t = norm_sec + tv.tv_usec * 1.0e-6;
|
||||
|
||||
return (float) t;
|
||||
}
|
||||
|
||||
// Robust Check the factorization of the matrix A2
|
||||
// Using directly Fortran services: dlacpy_, dtrmm, dlange_
|
||||
static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Checking the Cholesky Factorization... \n");
|
||||
#endif
|
||||
char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
|
||||
|
||||
double *Residual = (double *) malloc(N*N*sizeof(double));
|
||||
double *L1 = (double *) malloc(N*N*sizeof(double));
|
||||
double *L2 = (double *) malloc(N*N*sizeof(double));
|
||||
double *work = (double *) malloc(N*sizeof(double));
|
||||
|
||||
memset((void*)L1, 0, N*N*sizeof(double));
|
||||
memset((void*)L2, 0, N*N*sizeof(double));
|
||||
|
||||
double alpha= 1.0;
|
||||
|
||||
dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
|
||||
|
||||
/* Dealing with L'L or U'U */
|
||||
if (uplo == 'U'){
|
||||
dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
|
||||
dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
|
||||
dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
|
||||
} else{
|
||||
dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
|
||||
dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
|
||||
dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
|
||||
}
|
||||
|
||||
/* Compute the Residual || A -L'L|| */
|
||||
for (int i = 0; i < N; i++)
|
||||
for (int j = 0; j < N; j++)
|
||||
Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
|
||||
|
||||
double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
|
||||
double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
|
||||
|
||||
#ifdef VERBOSE
|
||||
printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
|
||||
#endif
|
||||
|
||||
const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
|
||||
|
||||
#ifdef VERBOSE
|
||||
if ( info_factorization) printf("> - Factorization is suspicious!\n");
|
||||
else printf("> - Factorization is CORRECT!\n");
|
||||
#endif
|
||||
|
||||
free(Residual); free(L1); free(L2); free(work);
|
||||
|
||||
return info_factorization;
|
||||
}
|
||||
|
||||
void initialize_matrix(const int n, const int ts, double *matrix)
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Initializing matrix with random values...\n");
|
||||
#endif
|
||||
|
||||
int ISEED[4] = {0,0,0,1};
|
||||
int intONE=1;
|
||||
|
||||
for (int i = 0; i < n*n; i+=n) {
|
||||
dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
|
||||
}
|
||||
|
||||
for (int i=0; i<n; i++) {
|
||||
for (int j=0; j<n; j++) {
|
||||
matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
|
||||
matrix[i*n+j] = matrix[j*n+i];
|
||||
}
|
||||
}
|
||||
|
||||
// Diagonal values
|
||||
for (int i = 0; i < n; i++) {
|
||||
matrix[i*n+i] += (double) n;
|
||||
}
|
||||
}
|
||||
|
||||
void gather_block(const int N, const int ts, double *Alin, double *A)
|
||||
{
|
||||
for (int i = 0; i < ts; i++)
|
||||
for (int j = 0; j < ts; j++) {
|
||||
A[i*ts + j] = Alin[i*N + j];
|
||||
}
|
||||
}
|
||||
|
||||
void scatter_block(const int N, const int ts, double *A, double *Alin)
|
||||
{
|
||||
for (int i = 0; i < ts; i++)
|
||||
for (int j = 0; j < ts; j++) {
|
||||
Alin[i*N + j] = A[i*ts + j];
|
||||
}
|
||||
}
|
||||
|
||||
void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Converting linear matrix to blocks...\n");
|
||||
#endif
|
||||
for (int i = 0; i < DIM; i++)
|
||||
for (int j = 0; j < DIM; j++) {
|
||||
gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Converting blocked matrix to linear...\n");
|
||||
#endif
|
||||
for (int i = 0; i < DIM; i++)
|
||||
for (int j = 0; j < DIM; j++) {
|
||||
scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
char *result[3] = {"n/a","pass","FAIL"};
|
||||
const double eps = pow(2.0, -53);
|
||||
|
||||
// If number of arguments is not 4, print help
|
||||
if ( argc != 4) {
|
||||
printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
|
||||
exit( -1 );
|
||||
}
|
||||
|
||||
const int n = atoi(argv[1]); // matrix size
|
||||
const int ts = atoi(argv[2]); // tile size
|
||||
int check = atoi(argv[3]); // check result?
|
||||
|
||||
// Compute number of tiles
|
||||
const int nt = n / ts;
|
||||
assert((nt*ts) == n); // tile size should divide size
|
||||
|
||||
// Allocate matrix
|
||||
double * const matrix = (double *) malloc(n * n * sizeof(double));
|
||||
assert(matrix != NULL);
|
||||
|
||||
// Initialize matrix
|
||||
initialize_matrix(n, ts, matrix);
|
||||
|
||||
// Allocate original matrix, and duplicate it, for debugging purposes
|
||||
double * const original_matrix = (double *) malloc(n * n * sizeof(double));
|
||||
assert(original_matrix != NULL);
|
||||
|
||||
// Save a copy of the original matrix
|
||||
for (int i = 0; i < n * n; i++ ) {
|
||||
original_matrix[i] = matrix[i];
|
||||
}
|
||||
|
||||
// Set version description: Indirect blocked matrix
|
||||
const char *version = "I-Blocked matrix";
|
||||
|
||||
// Allocate blocked matrix
|
||||
double *Ah[nt][nt];
|
||||
for (int i = 0; i < nt; i++) {
|
||||
for (int j = 0; j < nt; j++) {
|
||||
Ah[i][j] = malloc(ts * ts * sizeof(double));
|
||||
assert(Ah[i][j] != NULL);
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------
|
||||
// Convert, compute (time), and re-convert
|
||||
// ---------------------------------------
|
||||
convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
|
||||
const float tref = get_time();
|
||||
cholesky(ts, nt, (double* (*)[nt]) Ah);
|
||||
const float time = get_time() - tref;
|
||||
convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
|
||||
|
||||
// Free blocked matrix
|
||||
for (int i = 0; i < nt; i++) {
|
||||
for (int j = 0; j < nt; j++) {
|
||||
assert(Ah[i][j] != NULL);
|
||||
free(Ah[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
// Check result, if requested
|
||||
if ( check ) {
|
||||
const char uplo = 'L';
|
||||
if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
|
||||
}
|
||||
|
||||
// Free original matrix, not needed anymore
|
||||
free(original_matrix);
|
||||
|
||||
// Compute GFLOPs (Not verified)
|
||||
float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
|
||||
|
||||
// Print results
|
||||
#ifdef VERBOSE
|
||||
printf( "\n" );
|
||||
printf( "============ CHOLESKY RESULTS ============\n" );
|
||||
printf( " test %s\n", argv[0]);
|
||||
printf( " version %s\n", version);
|
||||
printf( " matrix size: %dx%d\n", n, n);
|
||||
printf( " tile size: %dx%d\n", ts, ts);
|
||||
printf( " number of threads: %d\n", omp_get_max_threads());
|
||||
printf( " time (s): %f\n", time);
|
||||
printf( " performance (gflops): %f\n", gflops);
|
||||
printf( " check: %s\n", result[check]);
|
||||
printf( "==========================================\n" );
|
||||
#else
|
||||
printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
|
||||
#endif
|
||||
|
||||
// Free matrix
|
||||
free(matrix);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
25
01-tasks/sudoku-tsk/Makefile
Normal file
25
01-tasks/sudoku-tsk/Makefile
Normal file
|
@ -0,0 +1,25 @@
|
|||
BIN_DIR=../bin
|
||||
PROGRAM=$(BIN_DIR)/sudoku-tsk
|
||||
|
||||
CC=g++
|
||||
CFLAGS=-c -O3 -fopenmp -DUSE_SEQUENTIAL_CUTOFF
|
||||
LDFLAGS=-fopenmp
|
||||
|
||||
OBJS=SudokuBoard.o sudoku.o
|
||||
|
||||
all: $(PROGRAM)
|
||||
|
||||
$(PROGRAM): $(OBJS)
|
||||
$(CC) -o $@ $^ $(LDFLAGS)
|
||||
|
||||
%.o: %.cpp
|
||||
$(CC) $(CFLAGS) -o $@ $^
|
||||
|
||||
$(BIN_DIR):
|
||||
mkdir $@
|
||||
|
||||
clean:
|
||||
rm -rf $(PROGRAM) *.o
|
||||
|
||||
wipe: clean
|
||||
rm -rf *.out *.err
|
119
01-tasks/sudoku-tsk/SudokuBoard.cpp
Normal file
119
01-tasks/sudoku-tsk/SudokuBoard.cpp
Normal file
|
@ -0,0 +1,119 @@
|
|||
/*
|
||||
* This file is part of Christian's OpenMP Task-parallel Sudoku Solver
|
||||
*
|
||||
* Copyright (C) 2013 by Christian Terboven <christian@terboven.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "SudokuBoard.h"
|
||||
|
||||
#include <cstring>
|
||||
|
||||
CSudokuBoard::CSudokuBoard(int fsize, int bsize)
|
||||
: field_size(fsize), block_size(bsize), solutions(-1)
|
||||
{
|
||||
field = new int[field_size*field_size];
|
||||
}
|
||||
|
||||
|
||||
CSudokuBoard::CSudokuBoard(const CSudokuBoard& other)
|
||||
: field_size(other.getFieldSize()), block_size(other.getBlockSize()), solutions(other.getNumSolutions())
|
||||
{
|
||||
field = new int[field_size*field_size];
|
||||
std::memcpy(field, other.field, sizeof(int) * field_size*field_size);
|
||||
}
|
||||
|
||||
|
||||
CSudokuBoard::~CSudokuBoard(void)
|
||||
{
|
||||
delete[] field;
|
||||
}
|
||||
|
||||
|
||||
bool CSudokuBoard::loadFromFile(char *filename)
|
||||
{
|
||||
std::ifstream ifile(filename);
|
||||
|
||||
if (!ifile) {
|
||||
std::cout << "There was an error opening the input file " << filename << std::endl;
|
||||
std::cout << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < this->field_size; i++) {
|
||||
for (int j = 0; j < this->field_size; j++) {
|
||||
ifile >> this->field[ACCESS(i,j)];
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
void CSudokuBoard::printBoard()
|
||||
{
|
||||
for(int i = 0; i < field_size; i++) {
|
||||
for(int j = 0; j < field_size; j++) {
|
||||
std::cout << std::setw(3) <<
|
||||
this->field[ACCESS(i,j)]
|
||||
<< " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
bool CSudokuBoard::check(int x, int y, int value)
|
||||
{
|
||||
if(checkHorizontal(y, value))
|
||||
return true;
|
||||
if(checkVertical(x, value))
|
||||
return true;
|
||||
if(checkBox(x, y, value))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
bool CSudokuBoard::checkHorizontal(int y, int value) {
|
||||
for(int i = 0; i < field_size; i++)
|
||||
if(field[ACCESS(y,i)] == value)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
bool CSudokuBoard::checkVertical(int x, int value) {
|
||||
for(int i = 0; i < field_size; i++)
|
||||
if(field[ACCESS(i,x)] == value)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
bool CSudokuBoard::checkBox(int x, int y, int value) {
|
||||
// find suitable box edge
|
||||
int x_box = (int)(x / block_size) * block_size;
|
||||
int y_box = (int)(y / block_size) * block_size;
|
||||
|
||||
for(int i = y_box; i < y_box + block_size; i++)
|
||||
for(int j = x_box; j < x_box + block_size; j++)
|
||||
if(field[ACCESS(i,j)] == value)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
96
01-tasks/sudoku-tsk/SudokuBoard.h
Normal file
96
01-tasks/sudoku-tsk/SudokuBoard.h
Normal file
|
@ -0,0 +1,96 @@
|
|||
/*
|
||||
* This file is part of Christian's OpenMP Task-parallel Sudoku Solver
|
||||
*
|
||||
* Copyright (C) 2013 by Christian Terboven <christian@terboven.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
|
||||
#define ACCESS(x, y) (this->field_size*(x) + (y))
|
||||
|
||||
/**
|
||||
* @brief Representation of the Sudoku board, includes solver and utility functions
|
||||
*/
|
||||
class CSudokuBoard
|
||||
{
|
||||
public:
|
||||
CSudokuBoard(int fsize, int bsize);
|
||||
CSudokuBoard(const CSudokuBoard& other);
|
||||
~CSudokuBoard(void);
|
||||
|
||||
inline int getNumSolutions() const {
|
||||
return this->solutions;
|
||||
}
|
||||
|
||||
inline void incrementSolutionCounter() {
|
||||
if (this->solutions == -1)
|
||||
this->solutions = 1;
|
||||
else
|
||||
this->solutions++;
|
||||
}
|
||||
|
||||
inline int getFieldSize() const {
|
||||
return this->field_size;
|
||||
}
|
||||
|
||||
inline int getBlockSize() const {
|
||||
return this->block_size;
|
||||
}
|
||||
|
||||
inline int get(int x, int y) const {
|
||||
return this->field[ACCESS(x,y)];
|
||||
}
|
||||
|
||||
inline void set(int x, int y, int value) {
|
||||
this->field[ACCESS(x, y)] = value;
|
||||
}
|
||||
|
||||
/**
|
||||
* Read Sudoku template from file
|
||||
* @param filename name of file to read input from
|
||||
* @return true if reading file was successful, false if not
|
||||
*/
|
||||
bool loadFromFile(char *filename);
|
||||
|
||||
/**
|
||||
* Print the Sudoku board to stdout
|
||||
*/
|
||||
void printBoard();
|
||||
|
||||
/**
|
||||
* Check whether value already used
|
||||
* @return true if already used, false if not
|
||||
*/
|
||||
bool check(int x, int y, int value);
|
||||
|
||||
|
||||
private:
|
||||
bool checkHorizontal(int y, int value);
|
||||
bool checkVertical(int x, int value);
|
||||
bool checkBox(int x, int y, int value);
|
||||
|
||||
int field_size;
|
||||
int block_size;
|
||||
|
||||
int *field;
|
||||
|
||||
int solutions;
|
||||
};
|
177
01-tasks/sudoku-tsk/sudoku-solved.cpp
Normal file
177
01-tasks/sudoku-tsk/sudoku-solved.cpp
Normal file
|
@ -0,0 +1,177 @@
|
|||
/*
|
||||
* This file is part of Christian's OpenMP Task-parallel Sudoku Solver
|
||||
*
|
||||
* Copyright (C) 2013 by Christian Terboven <christian@terboven.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
#include "SudokuBoard.h"
|
||||
|
||||
|
||||
int found_sudokus = 0;
|
||||
|
||||
|
||||
/**
|
||||
* Solve the Sudoku puzzle by finding and counting all solutions (if all are requested)
|
||||
*/
|
||||
bool solve(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true, bool findAllSolutions = true)
|
||||
{
|
||||
if(x == sudoku->getFieldSize()) { // end of line
|
||||
y++;
|
||||
x = 0;
|
||||
if(y == sudoku->getFieldSize()) // end
|
||||
return true;
|
||||
}
|
||||
|
||||
if(sudoku->get(y, x) > 0) // field already set
|
||||
return solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions); // tackle next field
|
||||
|
||||
for(int i = 1; i <= sudoku->getFieldSize(); i++) { // try all numbers
|
||||
if(!sudoku->check(x, y, i)) {
|
||||
sudoku->set(y, x, i); // if number fits, set it
|
||||
if(solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions)) { // tackle next field
|
||||
#pragma omp atomic
|
||||
found_sudokus++;
|
||||
sudoku->incrementSolutionCounter(); // solution found :-)
|
||||
if (printAnyFoundSolution) {
|
||||
std::cout << "The following is a valid solution:" << std::endl;
|
||||
sudoku->printBoard();
|
||||
std::cout << std::endl;
|
||||
}
|
||||
if (!findAllSolutions) {
|
||||
return true; // return, as only one solution was asked for
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sudoku->set(y, x, 0); // no solution found, reset field
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Solve the Sudoku puzzle in parallel by finding and counting all solutions
|
||||
*/
|
||||
bool solve_parallel(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true)
|
||||
{
|
||||
if (x == sudoku->getFieldSize()) { // end of line
|
||||
y++;
|
||||
x = 0;
|
||||
if(y == sudoku->getFieldSize()) // end
|
||||
return true;
|
||||
}
|
||||
|
||||
if (sudoku->get(y, x) > 0) { // field already set
|
||||
return solve_parallel(x+1, y, sudoku, printAnyFoundSolution); // tackle next field
|
||||
}
|
||||
|
||||
#if USE_SEQUENTIAL_CUTOFF
|
||||
if ( y > 1 ) {
|
||||
return solve(x, y, sudoku, printAnyFoundSolution);
|
||||
}
|
||||
#endif
|
||||
|
||||
for (int i = 1; i <= sudoku->getFieldSize(); i++) { // try all numbers
|
||||
if (!sudoku->check(x, y, i)) {
|
||||
#pragma omp task firstprivate(i,x,y,sudoku)
|
||||
{
|
||||
CSudokuBoard new_sudoku(*sudoku);
|
||||
new_sudoku.set(y, x, i); // if number fits, set it
|
||||
|
||||
if (solve_parallel(x+1, y, &new_sudoku, printAnyFoundSolution)) { // tackle next field
|
||||
#pragma omp atomic
|
||||
found_sudokus++;
|
||||
// sudoku->incrementSolutionCounter(); // solution found :-)
|
||||
if (printAnyFoundSolution) {
|
||||
std::cout << "The following is a valid solution:" << std::endl;
|
||||
new_sudoku.printBoard();
|
||||
std::cout << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma omp taskwait
|
||||
sudoku->set(y, x, 0); // no solution found, reset field
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/** @brief program entry point
|
||||
*/
|
||||
int main(int argc, char* argv[]) {
|
||||
// measure the time
|
||||
double t3, t4;
|
||||
int nthreads = 0;
|
||||
|
||||
// expect three command line arguments: field size, block size, and input file
|
||||
if (argc != 4) {
|
||||
std::cout << "Usage: sudoku.exe <field size> <block size> <input filename>" << std::endl;
|
||||
std::cout << std::endl;
|
||||
return -1;
|
||||
}
|
||||
else {
|
||||
CSudokuBoard *sudoku1 = new CSudokuBoard(atoi(argv[1]), atoi(argv[2]));
|
||||
if (!sudoku1->loadFromFile(argv[3])) {
|
||||
std::cout << "There was an error reading a Sudoku template from " << argv[3] << std::endl;
|
||||
std::cout << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// print the Sudoku board template
|
||||
std::cout << "Given Sudoku template" << std::endl;
|
||||
sudoku1->printBoard();
|
||||
|
||||
// solve the Sudoku by finding (and printing) all solutions
|
||||
t3 = omp_get_wtime();
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
solve_parallel(0, 0, sudoku1, false);
|
||||
}
|
||||
t4 = omp_get_wtime();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
#pragma omp master
|
||||
nthreads = omp_get_num_threads();
|
||||
}
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "In total there were " << sudoku1->getNumSolutions() << " solutions found in serial." << std::endl;
|
||||
std::cout << "In total there were " << found_sudokus << " solutions found in parallel " << std::endl;
|
||||
std::cout << std::endl;
|
||||
|
||||
delete sudoku1;
|
||||
}
|
||||
|
||||
// print the time
|
||||
std::cout << "Parallel computation took " << t4 - t3 << " seconds ("
|
||||
<< nthreads << " threads)." << std::endl << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
169
01-tasks/sudoku-tsk/sudoku.cpp
Normal file
169
01-tasks/sudoku-tsk/sudoku.cpp
Normal file
|
@ -0,0 +1,169 @@
|
|||
/*
|
||||
* This file is part of Christian's OpenMP Task-parallel Sudoku Solver
|
||||
*
|
||||
* Copyright (C) 2013 by Christian Terboven <christian@terboven.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
#include "SudokuBoard.h"
|
||||
|
||||
|
||||
int found_sudokus = 0;
|
||||
|
||||
|
||||
/**
|
||||
* Solve the Sudoku puzzle by finding and counting all solutions (if all are requested)
|
||||
*/
|
||||
bool solve(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true, bool findAllSolutions = true)
|
||||
{
|
||||
if(x == sudoku->getFieldSize()) { // end of line
|
||||
y++;
|
||||
x = 0;
|
||||
if(y == sudoku->getFieldSize()) // end
|
||||
return true;
|
||||
}
|
||||
|
||||
if(sudoku->get(y, x) > 0) // field already set
|
||||
return solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions); // tackle next field
|
||||
|
||||
for(int i = 1; i <= sudoku->getFieldSize(); i++) { // try all numbers
|
||||
if(!sudoku->check(x, y, i)) {
|
||||
sudoku->set(y, x, i); // if number fits, set it
|
||||
if(solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions)) { // tackle next field
|
||||
#pragma omp atomic
|
||||
found_sudokus++;
|
||||
sudoku->incrementSolutionCounter(); // solution found :-)
|
||||
if (printAnyFoundSolution) {
|
||||
std::cout << "The following is a valid solution:" << std::endl;
|
||||
sudoku->printBoard();
|
||||
std::cout << std::endl;
|
||||
}
|
||||
if (!findAllSolutions) {
|
||||
return true; // return, as only one solution was asked for
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sudoku->set(y, x, 0); // no solution found, reset field
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Solve the Sudoku puzzle in parallel by finding and counting all solutions
|
||||
*/
|
||||
bool solve_parallel(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true)
|
||||
{
|
||||
if (x == sudoku->getFieldSize()) { // end of line
|
||||
y++;
|
||||
x = 0;
|
||||
if(y == sudoku->getFieldSize()) // end
|
||||
return true;
|
||||
}
|
||||
|
||||
if (sudoku->get(y, x) > 0) { // field already set
|
||||
return solve_parallel(x+1, y, sudoku, printAnyFoundSolution); // tackle next field
|
||||
}
|
||||
|
||||
#if USE_SEQUENTIAL_CUTOFF
|
||||
if ( y > 1 ) {
|
||||
return solve(x, y, sudoku, printAnyFoundSolution);
|
||||
}
|
||||
#endif
|
||||
|
||||
// TODO: Task candidates (for each possible value?)
|
||||
for (int i = 1; i <= sudoku->getFieldSize(); i++) { // try all numbers
|
||||
if (!sudoku->check(x, y, i)) {
|
||||
{
|
||||
CSudokuBoard new_sudoku(*sudoku);
|
||||
new_sudoku.set(y, x, i); // if number fits, set it
|
||||
|
||||
if (solve_parallel(x+1, y, &new_sudoku, printAnyFoundSolution)) { // tackle next field
|
||||
#pragma omp atomic
|
||||
found_sudokus++;
|
||||
// sudoku->incrementSolutionCounter(); // solution found :-)
|
||||
if (printAnyFoundSolution) {
|
||||
std::cout << "The following is a valid solution:" << std::endl;
|
||||
new_sudoku.printBoard();
|
||||
std::cout << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: After creating tasks for this level we will need to synchronize
|
||||
sudoku->set(y, x, 0); // no solution found, reset field
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/** @brief program entry point
|
||||
*/
|
||||
int main(int argc, char* argv[]) {
|
||||
// measure the time
|
||||
double t3, t4;
|
||||
|
||||
// expect three command line arguments: field size, block size, and input file
|
||||
if (argc != 4) {
|
||||
std::cout << "Usage: sudoku.exe <field size> <block size> <input filename>" << std::endl;
|
||||
std::cout << std::endl;
|
||||
return -1;
|
||||
}
|
||||
else {
|
||||
CSudokuBoard *sudoku1 = new CSudokuBoard(atoi(argv[1]), atoi(argv[2]));
|
||||
if (!sudoku1->loadFromFile(argv[3])) {
|
||||
std::cout << "There was an error reading a Sudoku template from " << argv[3] << std::endl;
|
||||
std::cout << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// print the Sudoku board template
|
||||
std::cout << "Given Sudoku template" << std::endl;
|
||||
sudoku1->printBoard();
|
||||
|
||||
// solve the Sudoku by finding (and printing) all solutions
|
||||
t3 = omp_get_wtime();
|
||||
// TODO: Sudoku solver needs to execute within a parallel region
|
||||
{
|
||||
solve_parallel(0, 0, sudoku1, false);
|
||||
}
|
||||
t4 = omp_get_wtime();
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "In total there were " << found_sudokus << " solutions found in parallel " << std::endl;
|
||||
std::cout << std::endl;
|
||||
|
||||
delete sudoku1;
|
||||
}
|
||||
|
||||
// print the time
|
||||
std::cout << "Parallel computation took " << t4 - t3 << " seconds ("
|
||||
<< omp_get_max_threads() << " threads)." << std::endl << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
14
02-dependencies/Makefile
Normal file
14
02-dependencies/Makefile
Normal file
|
@ -0,0 +1,14 @@
|
|||
ALL_RECURSIVE=all-recursive clean-recursive
|
||||
|
||||
DIRS=$(shell ls -d *-* | sed '/_/d' 2>/dev/null)
|
||||
|
||||
all: $(CONFIG) all-recursive
|
||||
clean: $(CONFIG) clean-recursive
|
||||
|
||||
$(ALL_RECURSIVE):
|
||||
@failcom='exit 1';\
|
||||
target=`echo $@ | sed s/-recursive//`; \
|
||||
for subdir in $(DIRS); do \
|
||||
(cd $$subdir && $(MAKE) $$target) || eval $$failcom; \
|
||||
done;
|
||||
|
0
02-dependencies/bin/.gitdir
Normal file
0
02-dependencies/bin/.gitdir
Normal file
29
02-dependencies/bin/run-cholesky.sh
Executable file
29
02-dependencies/bin/run-cholesky.sh
Executable file
|
@ -0,0 +1,29 @@
|
|||
#!/bin/bash
|
||||
#SBATCH --job-name=cholesky
|
||||
#SBATCH --workdir=.
|
||||
#SBATCH --output=cholesky_%j.out
|
||||
#SBATCH --error=cholesky_%j.err
|
||||
#SBATCH --cpus-per-task=48
|
||||
#SBATCH --ntasks=1
|
||||
#SBATCH --time=00:30:00
|
||||
#SBATCH --qos=debug
|
||||
|
||||
export IFS=";"
|
||||
|
||||
THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48"
|
||||
#THREADS="01;02;04;06;08;10;12"
|
||||
|
||||
#MSIZES="2048;4096;8192"
|
||||
MSIZES="8192"
|
||||
MSIZES="16384"
|
||||
#MSIZES="2048"
|
||||
|
||||
BSIZES="256"
|
||||
|
||||
for MS in $MSIZES; do
|
||||
for BS in $BSIZES; do
|
||||
for threads in $THREADS; do
|
||||
OMP_NUM_THREADS=$threads ./cholesky-tsk-dep $MS $BS 0
|
||||
done
|
||||
done
|
||||
done
|
23
02-dependencies/cholesky-tsk-dep/Makefile
Normal file
23
02-dependencies/cholesky-tsk-dep/Makefile
Normal file
|
@ -0,0 +1,23 @@
|
|||
BIN_DIR=../bin
|
||||
PROGRAM=$(BIN_DIR)/cholesky-tsk-dep
|
||||
|
||||
MKL_HOME=$(MKLROOT)
|
||||
|
||||
CC=gcc
|
||||
CFLAGS=-O3 -std=gnu99 -I$(MKL_HOME)/include -fopenmp
|
||||
LDFLAGS=-L$(MKL_HOME)/lib/intel64 -lmkl_sequential -lmkl_core -lmkl_rt -lpthread -lm -fopenmp
|
||||
#VERBOSE=-DVERBOSE
|
||||
|
||||
all: $(PROGRAM)
|
||||
|
||||
$(PROGRAM): cholesky.c
|
||||
$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
|
||||
|
||||
$(BIN_DIR):
|
||||
mkdir $@
|
||||
|
||||
clean:
|
||||
rm -rf $(PROGRAM) *.o
|
||||
|
||||
wipe: clean
|
||||
rm -rf *.out *.err
|
296
02-dependencies/cholesky-tsk-dep/cholesky-solved.c
Normal file
296
02-dependencies/cholesky-tsk-dep/cholesky-solved.c
Normal file
|
@ -0,0 +1,296 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include <sys/time.h>
|
||||
#include <sys/times.h>
|
||||
|
||||
#include <math.h>
|
||||
#include <mkl.h>
|
||||
|
||||
#include "omp.h"
|
||||
|
||||
#if !defined(_OPENMP)
|
||||
int omp_get_max_threads() { return 1; }
|
||||
int omp_get_num_threads() { return 1; }
|
||||
#endif
|
||||
|
||||
void cholesky(int ts, int nt, double* Ah[nt][nt])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
|
||||
#endif
|
||||
|
||||
#pragma omp parallel
|
||||
#pragma omp single
|
||||
for (int k = 0; k < nt; k++) {
|
||||
// Diagonal Block factorization: using LAPACK
|
||||
#pragma omp task depend(inout: Ah[k][k])
|
||||
LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
|
||||
|
||||
// Triangular systems
|
||||
for (int i = k + 1; i < nt; i++) {
|
||||
#pragma omp task depend(in: Ah[k][k]) depend(inout: Ah[k][i])
|
||||
cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
|
||||
}
|
||||
|
||||
// Update trailing matrix
|
||||
for (int i = k + 1; i < nt; i++) {
|
||||
for (int j = k + 1; j < i; j++) {
|
||||
#pragma omp task depend(in: Ah[k][i], Ah[k][j]) depend(inout: Ah[j][i])
|
||||
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
|
||||
}
|
||||
#pragma omp task depend(in: Ah[k][i]) depend(inout: Ah[i][i])
|
||||
cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
|
||||
}
|
||||
}
|
||||
|
||||
#pragma omp taskwait
|
||||
#ifdef VERBOSE
|
||||
printf("> ...end of Cholesky Factorization.\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
float get_time()
|
||||
{
|
||||
static double gtod_ref_time_sec = 0.0;
|
||||
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
|
||||
// If this is the first invocation of through dclock(), then initialize the
|
||||
// "reference time" global variable to the seconds field of the tv struct.
|
||||
if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
|
||||
|
||||
// Normalize the seconds field of the tv struct so that it is relative to the
|
||||
// "reference time" that was recorded during the first invocation of dclock().
|
||||
const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
|
||||
|
||||
// Compute the number of seconds since the reference time.
|
||||
const double t = norm_sec + tv.tv_usec * 1.0e-6;
|
||||
|
||||
return (float) t;
|
||||
}
|
||||
|
||||
// Robust Check the factorization of the matrix A2
|
||||
// Using directly Fortran services: dlacpy_, dtrmm, dlange_
|
||||
static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Checking the Cholesky Factorization... \n");
|
||||
#endif
|
||||
char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
|
||||
|
||||
double *Residual = (double *) malloc(N*N*sizeof(double));
|
||||
double *L1 = (double *) malloc(N*N*sizeof(double));
|
||||
double *L2 = (double *) malloc(N*N*sizeof(double));
|
||||
double *work = (double *) malloc(N*sizeof(double));
|
||||
|
||||
memset((void*)L1, 0, N*N*sizeof(double));
|
||||
memset((void*)L2, 0, N*N*sizeof(double));
|
||||
|
||||
double alpha= 1.0;
|
||||
|
||||
dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
|
||||
|
||||
/* Dealing with L'L or U'U */
|
||||
if (uplo == 'U'){
|
||||
dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
|
||||
dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
|
||||
dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
|
||||
} else{
|
||||
dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
|
||||
dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
|
||||
dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
|
||||
}
|
||||
|
||||
/* Compute the Residual || A -L'L|| */
|
||||
for (int i = 0; i < N; i++)
|
||||
for (int j = 0; j < N; j++)
|
||||
Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
|
||||
|
||||
double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
|
||||
double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
|
||||
|
||||
#ifdef VERBOSE
|
||||
printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
|
||||
#endif
|
||||
|
||||
const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
|
||||
|
||||
#ifdef VERBOSE
|
||||
if ( info_factorization) printf("> - Factorization is suspicious!\n");
|
||||
else printf("> - Factorization is CORRECT!\n");
|
||||
#endif
|
||||
|
||||
free(Residual); free(L1); free(L2); free(work);
|
||||
|
||||
return info_factorization;
|
||||
}
|
||||
|
||||
void initialize_matrix(const int n, const int ts, double *matrix)
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Initializing matrix with random values...\n");
|
||||
#endif
|
||||
|
||||
int ISEED[4] = {0,0,0,1};
|
||||
int intONE=1;
|
||||
|
||||
for (int i = 0; i < n*n; i+=n) {
|
||||
dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
|
||||
}
|
||||
|
||||
for (int i=0; i<n; i++) {
|
||||
for (int j=0; j<n; j++) {
|
||||
matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
|
||||
matrix[i*n+j] = matrix[j*n+i];
|
||||
}
|
||||
}
|
||||
|
||||
// Diagonal values
|
||||
for (int i = 0; i < n; i++) {
|
||||
matrix[i*n+i] += (double) n;
|
||||
}
|
||||
}
|
||||
|
||||
void gather_block(const int N, const int ts, double *Alin, double *A)
|
||||
{
|
||||
for (int i = 0; i < ts; i++)
|
||||
for (int j = 0; j < ts; j++) {
|
||||
A[i*ts + j] = Alin[i*N + j];
|
||||
}
|
||||
}
|
||||
|
||||
void scatter_block(const int N, const int ts, double *A, double *Alin)
|
||||
{
|
||||
for (int i = 0; i < ts; i++)
|
||||
for (int j = 0; j < ts; j++) {
|
||||
Alin[i*N + j] = A[i*ts + j];
|
||||
}
|
||||
}
|
||||
|
||||
void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Converting linear matrix to blocks...\n");
|
||||
#endif
|
||||
for (int i = 0; i < DIM; i++)
|
||||
for (int j = 0; j < DIM; j++) {
|
||||
gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Converting blocked matrix to linear...\n");
|
||||
#endif
|
||||
for (int i = 0; i < DIM; i++)
|
||||
for (int j = 0; j < DIM; j++) {
|
||||
scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
char *result[3] = {"n/a","pass","FAIL"};
|
||||
const double eps = pow(2.0, -53);
|
||||
|
||||
// If number of arguments is not 4, print help
|
||||
if ( argc != 4) {
|
||||
printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
|
||||
exit( -1 );
|
||||
}
|
||||
|
||||
const int n = atoi(argv[1]); // matrix size
|
||||
const int ts = atoi(argv[2]); // tile size
|
||||
int check = atoi(argv[3]); // check result?
|
||||
|
||||
// Compute number of tiles
|
||||
const int nt = n / ts;
|
||||
assert((nt*ts) == n); // tile size should divide size
|
||||
|
||||
// Allocate matrix
|
||||
double * const matrix = (double *) malloc(n * n * sizeof(double));
|
||||
assert(matrix != NULL);
|
||||
|
||||
// Initialize matrix
|
||||
initialize_matrix(n, ts, matrix);
|
||||
|
||||
// Allocate original matrix, and duplicate it, for debugging purposes
|
||||
double * const original_matrix = (double *) malloc(n * n * sizeof(double));
|
||||
assert(original_matrix != NULL);
|
||||
|
||||
// Save a copy of the original matrix
|
||||
for (int i = 0; i < n * n; i++ ) {
|
||||
original_matrix[i] = matrix[i];
|
||||
}
|
||||
|
||||
// Set version description: Indirect blocked matrix
|
||||
const char *version = "I-Blocked matrix";
|
||||
|
||||
// Allocate blocked matrix
|
||||
double *Ah[nt][nt];
|
||||
for (int i = 0; i < nt; i++) {
|
||||
for (int j = 0; j < nt; j++) {
|
||||
Ah[i][j] = malloc(ts * ts * sizeof(double));
|
||||
assert(Ah[i][j] != NULL);
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------
|
||||
// Convert, compute (time), and re-convert
|
||||
// ---------------------------------------
|
||||
convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
|
||||
const float tref = get_time();
|
||||
cholesky(ts, nt, (double* (*)[nt]) Ah);
|
||||
const float time = get_time() - tref;
|
||||
convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
|
||||
|
||||
// Free blocked matrix
|
||||
for (int i = 0; i < nt; i++) {
|
||||
for (int j = 0; j < nt; j++) {
|
||||
assert(Ah[i][j] != NULL);
|
||||
free(Ah[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
// Check result, if requested
|
||||
if ( check ) {
|
||||
const char uplo = 'L';
|
||||
if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
|
||||
}
|
||||
|
||||
// Free original matrix, not needed anymore
|
||||
free(original_matrix);
|
||||
|
||||
// Compute GFLOPs (Not verified)
|
||||
float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
|
||||
|
||||
// Print results
|
||||
#ifdef VERBOSE
|
||||
printf( "\n" );
|
||||
printf( "============ CHOLESKY RESULTS ============\n" );
|
||||
printf( " test %s\n", argv[0]);
|
||||
printf( " version %s\n", version);
|
||||
printf( " matrix size: %dx%d\n", n, n);
|
||||
printf( " tile size: %dx%d\n", ts, ts);
|
||||
printf( " number of threads: %d\n", omp_get_max_threads());
|
||||
printf( " time (s): %f\n", time);
|
||||
printf( " performance (gflops): %f\n", gflops);
|
||||
printf( " check: %s\n", result[check]);
|
||||
printf( "==========================================\n" );
|
||||
#else
|
||||
printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
|
||||
#endif
|
||||
|
||||
// Free matrix
|
||||
free(matrix);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
293
02-dependencies/cholesky-tsk-dep/cholesky.c
Normal file
293
02-dependencies/cholesky-tsk-dep/cholesky.c
Normal file
|
@ -0,0 +1,293 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include <sys/time.h>
|
||||
#include <sys/times.h>
|
||||
|
||||
#include <math.h>
|
||||
#include <mkl.h>
|
||||
|
||||
#include "omp.h"
|
||||
|
||||
#if !defined(_OPENMP)
|
||||
int omp_get_max_threads() { return 1; }
|
||||
int omp_get_num_threads() { return 1; }
|
||||
#endif
|
||||
|
||||
void cholesky(int ts, int nt, double* Ah[nt][nt])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
|
||||
#endif
|
||||
|
||||
// TODO: Parallel region is required in order to run in parallel.
|
||||
// TODO: Kernel invocations could be task candidates.
|
||||
// TODO: Add scheduler restrictions between tasks using dependences.
|
||||
for (int k = 0; k < nt; k++) {
|
||||
// Diagonal Block factorization: using LAPACK
|
||||
LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
|
||||
|
||||
// Triangular systems
|
||||
for (int i = k + 1; i < nt; i++) {
|
||||
cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
|
||||
}
|
||||
|
||||
// Update trailing matrix
|
||||
for (int i = k + 1; i < nt; i++) {
|
||||
for (int j = k + 1; j < i; j++) {
|
||||
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
|
||||
}
|
||||
cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: we need to guarantee all the work has finished at this point
|
||||
#ifdef VERBOSE
|
||||
printf("> ...end of Cholesky Factorization.\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
float get_time()
|
||||
{
|
||||
static double gtod_ref_time_sec = 0.0;
|
||||
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
|
||||
// If this is the first invocation of through dclock(), then initialize the
|
||||
// "reference time" global variable to the seconds field of the tv struct.
|
||||
if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
|
||||
|
||||
// Normalize the seconds field of the tv struct so that it is relative to the
|
||||
// "reference time" that was recorded during the first invocation of dclock().
|
||||
const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
|
||||
|
||||
// Compute the number of seconds since the reference time.
|
||||
const double t = norm_sec + tv.tv_usec * 1.0e-6;
|
||||
|
||||
return (float) t;
|
||||
}
|
||||
|
||||
// Robust Check the factorization of the matrix A2
|
||||
// Using directly Fortran services: dlacpy_, dtrmm, dlange_
|
||||
static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Checking the Cholesky Factorization... \n");
|
||||
#endif
|
||||
char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
|
||||
|
||||
double *Residual = (double *) malloc(N*N*sizeof(double));
|
||||
double *L1 = (double *) malloc(N*N*sizeof(double));
|
||||
double *L2 = (double *) malloc(N*N*sizeof(double));
|
||||
double *work = (double *) malloc(N*sizeof(double));
|
||||
|
||||
memset((void*)L1, 0, N*N*sizeof(double));
|
||||
memset((void*)L2, 0, N*N*sizeof(double));
|
||||
|
||||
double alpha= 1.0;
|
||||
|
||||
dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
|
||||
|
||||
/* Dealing with L'L or U'U */
|
||||
if (uplo == 'U'){
|
||||
dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
|
||||
dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
|
||||
dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
|
||||
} else{
|
||||
dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
|
||||
dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
|
||||
dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
|
||||
}
|
||||
|
||||
/* Compute the Residual || A -L'L|| */
|
||||
for (int i = 0; i < N; i++)
|
||||
for (int j = 0; j < N; j++)
|
||||
Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
|
||||
|
||||
double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
|
||||
double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
|
||||
|
||||
#ifdef VERBOSE
|
||||
printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
|
||||
#endif
|
||||
|
||||
const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
|
||||
|
||||
#ifdef VERBOSE
|
||||
if ( info_factorization) printf("> - Factorization is suspicious!\n");
|
||||
else printf("> - Factorization is CORRECT!\n");
|
||||
#endif
|
||||
|
||||
free(Residual); free(L1); free(L2); free(work);
|
||||
|
||||
return info_factorization;
|
||||
}
|
||||
|
||||
void initialize_matrix(const int n, const int ts, double *matrix)
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Initializing matrix with random values...\n");
|
||||
#endif
|
||||
|
||||
int ISEED[4] = {0,0,0,1};
|
||||
int intONE=1;
|
||||
|
||||
for (int i = 0; i < n*n; i+=n) {
|
||||
dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
|
||||
}
|
||||
|
||||
for (int i=0; i<n; i++) {
|
||||
for (int j=0; j<n; j++) {
|
||||
matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
|
||||
matrix[i*n+j] = matrix[j*n+i];
|
||||
}
|
||||
}
|
||||
|
||||
// Diagonal values
|
||||
for (int i = 0; i < n; i++) {
|
||||
matrix[i*n+i] += (double) n;
|
||||
}
|
||||
}
|
||||
|
||||
void gather_block(const int N, const int ts, double *Alin, double *A)
|
||||
{
|
||||
for (int i = 0; i < ts; i++)
|
||||
for (int j = 0; j < ts; j++) {
|
||||
A[i*ts + j] = Alin[i*N + j];
|
||||
}
|
||||
}
|
||||
|
||||
void scatter_block(const int N, const int ts, double *A, double *Alin)
|
||||
{
|
||||
for (int i = 0; i < ts; i++)
|
||||
for (int j = 0; j < ts; j++) {
|
||||
Alin[i*N + j] = A[i*ts + j];
|
||||
}
|
||||
}
|
||||
|
||||
void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Converting linear matrix to blocks...\n");
|
||||
#endif
|
||||
for (int i = 0; i < DIM; i++)
|
||||
for (int j = 0; j < DIM; j++) {
|
||||
gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
|
||||
{
|
||||
#ifdef VERBOSE
|
||||
printf("> Converting blocked matrix to linear...\n");
|
||||
#endif
|
||||
for (int i = 0; i < DIM; i++)
|
||||
for (int j = 0; j < DIM; j++) {
|
||||
scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
char *result[3] = {"n/a","pass","FAIL"};
|
||||
const double eps = pow(2.0, -53);
|
||||
|
||||
// If number of arguments is not 4, print help
|
||||
if ( argc != 4) {
|
||||
printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
|
||||
exit( -1 );
|
||||
}
|
||||
|
||||
const int n = atoi(argv[1]); // matrix size
|
||||
const int ts = atoi(argv[2]); // tile size
|
||||
int check = atoi(argv[3]); // check result?
|
||||
|
||||
// Compute number of tiles
|
||||
const int nt = n / ts;
|
||||
assert((nt*ts) == n); // tile size should divide size
|
||||
|
||||
// Allocate matrix
|
||||
double * const matrix = (double *) malloc(n * n * sizeof(double));
|
||||
assert(matrix != NULL);
|
||||
|
||||
// Initialize matrix
|
||||
initialize_matrix(n, ts, matrix);
|
||||
|
||||
// Allocate original matrix, and duplicate it, for debugging purposes
|
||||
double * const original_matrix = (double *) malloc(n * n * sizeof(double));
|
||||
assert(original_matrix != NULL);
|
||||
|
||||
// Save a copy of the original matrix
|
||||
for (int i = 0; i < n * n; i++ ) {
|
||||
original_matrix[i] = matrix[i];
|
||||
}
|
||||
|
||||
// Set version description: Indirect blocked matrix
|
||||
const char *version = "I-Blocked matrix";
|
||||
|
||||
// Allocate blocked matrix
|
||||
double *Ah[nt][nt];
|
||||
for (int i = 0; i < nt; i++) {
|
||||
for (int j = 0; j < nt; j++) {
|
||||
Ah[i][j] = malloc(ts * ts * sizeof(double));
|
||||
assert(Ah[i][j] != NULL);
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------
|
||||
// Convert, compute (time), and re-convert
|
||||
// ---------------------------------------
|
||||
convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
|
||||
const float tref = get_time();
|
||||
cholesky(ts, nt, (double* (*)[nt]) Ah);
|
||||
const float time = get_time() - tref;
|
||||
convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
|
||||
|
||||
// Free blocked matrix
|
||||
for (int i = 0; i < nt; i++) {
|
||||
for (int j = 0; j < nt; j++) {
|
||||
assert(Ah[i][j] != NULL);
|
||||
free(Ah[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
// Check result, if requested
|
||||
if ( check ) {
|
||||
const char uplo = 'L';
|
||||
if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
|
||||
}
|
||||
|
||||
// Free original matrix, not needed anymore
|
||||
free(original_matrix);
|
||||
|
||||
// Compute GFLOPs (Not verified)
|
||||
float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
|
||||
|
||||
// Print results
|
||||
#ifdef VERBOSE
|
||||
printf( "\n" );
|
||||
printf( "============ CHOLESKY RESULTS ============\n" );
|
||||
printf( " test %s\n", argv[0]);
|
||||
printf( " version %s\n", version);
|
||||
printf( " matrix size: %dx%d\n", n, n);
|
||||
printf( " tile size: %dx%d\n", ts, ts);
|
||||
printf( " number of threads: %d\n", omp_get_max_threads());
|
||||
printf( " time (s): %f\n", time);
|
||||
printf( " performance (gflops): %f\n", gflops);
|
||||
printf( " check: %s\n", result[check]);
|
||||
printf( "==========================================\n" );
|
||||
#else
|
||||
printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
|
||||
#endif
|
||||
|
||||
// Free matrix
|
||||
free(matrix);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
14
03-cut-offs/Makefile
Normal file
14
03-cut-offs/Makefile
Normal file
|
@ -0,0 +1,14 @@
|
|||
ALL_RECURSIVE=all-recursive clean-recursive
|
||||
|
||||
DIRS=$(shell ls -d */ | sed '/_/d' 2>/dev/null | sed '/bin/d' 2>/dev/null)
|
||||
|
||||
all: all-recursive
|
||||
clean: clean-recursive
|
||||
|
||||
$(ALL_RECURSIVE):
|
||||
@failcom='exit 1';\
|
||||
target=`echo $@ | sed s/-recursive//`; \
|
||||
for subdir in $(DIRS); do \
|
||||
(cd $$subdir && $(MAKE) $$target) || eval $$failcom; \
|
||||
done;
|
||||
|
0
03-cut-offs/bin/.gitdir
Normal file
0
03-cut-offs/bin/.gitdir
Normal file
24
03-cut-offs/bin/run-mergesort.sh
Executable file
24
03-cut-offs/bin/run-mergesort.sh
Executable file
|
@ -0,0 +1,24 @@
|
|||
#!/bin/bash
|
||||
#SBATCH --job-name=mergesort
|
||||
#SBATCH --workdir=.
|
||||
#SBATCH --output=mergesort_%j.out
|
||||
#SBATCH --error=mergesort_%j.err
|
||||
#SBATCH --cpus-per-task=48
|
||||
#SBATCH --ntasks=1
|
||||
#SBATCH --time=00:30:00
|
||||
#SBATCH --qos=debug
|
||||
|
||||
export IFS=";"
|
||||
|
||||
THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48"
|
||||
#THREADS="01;02;04;06;08;10;12"
|
||||
|
||||
ASIZES="1000000"
|
||||
|
||||
for AS in $ASIZES; do
|
||||
for threads in $THREADS; do
|
||||
echo Running mergesort-co with $threads
|
||||
OMP_NUM_THREADS=$threads ./mergesort-co $AS
|
||||
echo --
|
||||
done
|
||||
done
|
20
03-cut-offs/bin/run-sudoku.sh
Executable file
20
03-cut-offs/bin/run-sudoku.sh
Executable file
|
@ -0,0 +1,20 @@
|
|||
#!/bin/bash
|
||||
#SBATCH --job-name=sudoku
|
||||
#SBATCH --workdir=.
|
||||
#SBATCH --output=sudoku_%j.out
|
||||
#SBATCH --error=sudoku_%j.err
|
||||
#SBATCH --cpus-per-task=48
|
||||
#SBATCH --ntasks=1
|
||||
#SBATCH --time=00:30:00
|
||||
#SBATCH --qos=debug
|
||||
|
||||
export IFS=";"
|
||||
|
||||
THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48"
|
||||
#THREADS="01;02;04;06;08;10;12"
|
||||
|
||||
for threads in $THREADS; do
|
||||
#OMP_NUM_THREADS=$threads ./sudoku-co 9 3 sudoku-9x9-1.txt
|
||||
OMP_NUM_THREADS=$threads ./sudoku-co 16 4 sudoku-16x16-1.txt
|
||||
#OMP_NUM_THREADS=$threads ./sudoku-co 25 5 sudoku-25x25-1.txt
|
||||
done
|
16
03-cut-offs/bin/sudoku-16x16-1.txt
Normal file
16
03-cut-offs/bin/sudoku-16x16-1.txt
Normal file
|
@ -0,0 +1,16 @@
|
|||
0 6 0 0 0 0 0 8 11 0 0 15 14 0 0 16
|
||||
15 11 0 0 0 16 14 0 0 0 12 0 0 6 0 0
|
||||
13 0 9 12 0 0 0 0 3 16 14 0 15 11 10 0
|
||||
2 0 16 0 11 0 15 10 1 0 0 0 0 0 0 0
|
||||
0 15 11 10 0 0 16 2 13 8 9 12 0 0 0 0
|
||||
12 13 0 0 4 1 5 6 2 3 0 0 0 0 11 10
|
||||
5 0 6 1 12 0 9 0 15 11 10 7 16 0 0 3
|
||||
0 2 0 0 0 10 0 11 6 0 5 0 0 13 0 9
|
||||
10 7 15 11 16 0 0 0 12 13 0 0 0 0 0 6
|
||||
9 0 0 0 0 0 1 0 0 2 0 16 10 0 0 11
|
||||
1 0 4 6 9 13 0 0 7 0 11 0 3 16 0 0
|
||||
16 14 0 0 7 0 10 15 4 6 1 0 0 0 13 8
|
||||
11 10 0 15 0 0 0 16 9 12 13 0 0 1 5 4
|
||||
0 0 12 0 1 4 6 0 16 0 0 0 11 10 0 0
|
||||
0 0 5 0 8 12 13 0 10 0 0 11 2 0 0 14
|
||||
3 16 0 0 10 0 0 7 0 0 6 0 0 0 12 0
|
25
03-cut-offs/bin/sudoku-25x25-1.txt
Normal file
25
03-cut-offs/bin/sudoku-25x25-1.txt
Normal file
|
@ -0,0 +1,25 @@
|
|||
0 8 0 18 0 12 20 21 16 0 0 0 0 0 0 15 0 0 24 5 7 0 19 0 0
|
||||
0 0 15 9 5 4 0 23 0 18 12 20 0 21 16 7 0 0 0 2 1 0 0 0 0
|
||||
0 0 0 0 14 7 0 0 0 0 0 24 0 0 0 0 10 0 0 16 0 0 23 18 6
|
||||
21 20 12 0 16 0 0 25 14 22 7 0 0 19 2 4 0 23 8 6 0 24 0 0 0
|
||||
19 0 7 0 2 15 0 0 0 9 0 0 18 0 0 0 0 0 0 0 12 0 21 0 16
|
||||
0 23 6 0 0 16 0 0 0 20 14 25 11 0 0 5 24 7 3 9 2 19 1 0 17
|
||||
4 21 0 20 0 0 0 12 22 11 2 19 0 0 0 0 0 15 23 0 0 3 7 0 9
|
||||
12 25 14 0 22 0 19 0 0 13 0 0 24 7 0 0 0 0 21 10 6 0 15 8 18
|
||||
7 0 0 24 9 6 0 15 18 0 0 0 0 0 10 0 13 1 19 0 0 25 0 11 22
|
||||
0 19 2 13 17 5 0 0 9 0 6 0 8 15 0 14 11 12 25 22 16 0 0 0 10
|
||||
0 0 0 1 0 0 0 0 3 7 8 0 15 0 0 11 12 10 0 0 0 0 18 4 0
|
||||
10 16 11 12 25 0 14 22 19 1 0 0 0 17 3 20 0 0 6 21 8 0 0 0 0
|
||||
0 0 8 0 23 20 0 18 21 4 11 16 0 0 25 0 7 0 2 0 0 14 22 0 0
|
||||
0 0 0 0 0 0 0 0 0 0 0 0 0 22 19 8 15 0 5 23 24 0 17 0 3
|
||||
0 2 0 7 3 8 0 9 23 15 20 0 4 0 21 13 1 0 14 19 0 0 10 12 25
|
||||
2 0 0 0 0 18 15 5 0 23 0 0 21 6 20 17 0 14 1 13 0 12 0 25 11
|
||||
0 1 0 19 13 9 7 0 24 0 0 0 23 0 8 0 25 0 0 11 0 0 0 21 0
|
||||
0 0 10 0 0 0 0 0 0 0 17 1 0 14 0 18 0 5 15 8 9 0 0 0 0
|
||||
16 12 0 0 11 0 0 14 0 0 0 0 3 2 24 0 21 0 4 20 0 15 0 0 8
|
||||
0 0 0 0 0 10 0 0 0 0 0 12 0 0 11 9 3 0 0 0 17 0 14 19 13
|
||||
24 0 0 0 15 21 18 0 4 0 0 0 16 20 0 3 2 13 0 7 19 22 0 0 1
|
||||
0 18 0 0 0 0 10 0 0 16 0 22 0 0 0 23 0 24 9 0 3 17 0 0 0
|
||||
0 0 0 0 0 0 0 0 15 5 0 0 6 8 4 0 14 11 22 1 25 0 20 16 12
|
||||
0 22 19 14 1 3 0 0 0 0 23 9 5 0 15 25 0 20 10 12 21 0 8 0 4
|
||||
20 0 25 16 0 0 22 11 1 0 3 0 2 13 7 21 0 0 0 0 0 9 0 0 0
|
10
03-cut-offs/bin/sudoku-9x9-1.txt
Normal file
10
03-cut-offs/bin/sudoku-9x9-1.txt
Normal file
|
@ -0,0 +1,10 @@
|
|||
5 3 0 0 7 0 0 0 0
|
||||
6 0 0 1 9 5 3 0 0
|
||||
0 9 8 0 0 0 0 6 0
|
||||
8 0 0 0 6 0 0 0 3
|
||||
4 0 0 8 0 3 0 0 1
|
||||
7 0 0 0 2 0 0 0 6
|
||||
0 6 0 0 0 0 2 8 0
|
||||
0 0 0 4 1 9 0 0 5
|
||||
0 0 0 0 8 0 0 7 9
|
||||
|
20
03-cut-offs/mergesort-co/Makefile
Normal file
20
03-cut-offs/mergesort-co/Makefile
Normal file
|
@ -0,0 +1,20 @@
|
|||
BIN_DIR=../bin
|
||||
PROGRAM=$(BIN_DIR)/mergesort-co
|
||||
|
||||
CC=g++
|
||||
CFLAGS=-O3 -fopenmp
|
||||
LDFLAGS=-fopenmp
|
||||
|
||||
all: $(PROGRAM)
|
||||
|
||||
$(PROGRAM): mergesort.cpp
|
||||
$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
|
||||
|
||||
$(BIN_DIR):
|
||||
mkdir $@
|
||||
|
||||
clean:
|
||||
rm -rf $(PROGRAM) *.o
|
||||
|
||||
wipe: clean
|
||||
rm -rf *.out *.err
|
272
03-cut-offs/mergesort-co/mergesort-solved.cpp
Normal file
272
03-cut-offs/mergesort-co/mergesort-solved.cpp
Normal file
|
@ -0,0 +1,272 @@
|
|||
/*
|
||||
* This file is part of Christian's OpenMP software lab
|
||||
*
|
||||
* Copyright (C) 2016 by Christian Terboven <terboven@itc.rwth-aachen.de>
|
||||
* Copyright (C) 2016 by Jonas Hahnfeld <hahnfeld@itc.rwth-aachen.de>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <errno.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
|
||||
#include <cstdlib>
|
||||
#include <cstdio>
|
||||
|
||||
#include <cmath>
|
||||
#include <ctime>
|
||||
#include <cstring>
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
|
||||
/**
|
||||
* helper routine: check if array is sorted correctly
|
||||
*/
|
||||
bool isSorted(int ref[], int data[], const size_t size){
|
||||
std::sort(ref, ref + size);
|
||||
for (size_t idx = 0; idx < size; ++idx){
|
||||
if (ref[idx] != data[idx]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* sequential merge step (straight-forward implementation)
|
||||
*/
|
||||
void MsMergeSequential(int *out, int *in, long begin1, long end1, long begin2, long end2, long outBegin) {
|
||||
long left = begin1;
|
||||
long right = begin2;
|
||||
|
||||
long idx = outBegin;
|
||||
|
||||
while (left < end1 && right < end2) {
|
||||
if (in[left] <= in[right]) {
|
||||
out[idx] = in[left];
|
||||
left++;
|
||||
} else {
|
||||
out[idx] = in[right];
|
||||
right++;
|
||||
}
|
||||
idx++;
|
||||
}
|
||||
|
||||
while (left < end1) {
|
||||
out[idx] = in[left];
|
||||
left++, idx++;
|
||||
}
|
||||
|
||||
while (right < end2) {
|
||||
out[idx] = in[right];
|
||||
right++, idx++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* sequential MergeSort
|
||||
*/
|
||||
void MsSequential(int *array, int *tmp, bool inplace, long begin, long end) {
|
||||
if (begin < (end - 1)) {
|
||||
const long half = (begin + end) / 2;
|
||||
MsSequential(array, tmp, !inplace, begin, half);
|
||||
MsSequential(array, tmp, !inplace, half, end);
|
||||
if (inplace) {
|
||||
MsMergeSequential(array, tmp, begin, half, half, end, begin);
|
||||
} else {
|
||||
MsMergeSequential(tmp, array, begin, half, half, end, begin);
|
||||
}
|
||||
} else if (!inplace) {
|
||||
tmp[begin] = array[begin];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* parallel merge step (straight-forward implementation)
|
||||
*/
|
||||
void MsMergeParallel(int *out, int *in, long begin1, long end1, long begin2, long end2, long outBegin, int deep) {
|
||||
if (deep) {
|
||||
long half1, half2, tmp, count, step;
|
||||
if ((end1 - begin1) < (end2 - begin2)) {
|
||||
half2 = (begin2 + end2) / 2;
|
||||
// find in[half2] in [begin1, end1) (std::upper_bound)
|
||||
half1 = begin1, count = (end1 - begin1);
|
||||
while (count > 0) {
|
||||
step = count / 2;
|
||||
tmp = half1 + step;
|
||||
if (in[tmp] <= in[half2]) {
|
||||
tmp++;
|
||||
half1 = tmp;
|
||||
count -= step + 1;
|
||||
} else {
|
||||
count = step;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
half1 = (begin1 + end1) / 2;
|
||||
// find in[half1] in [begin2, end2) (std::lower_bound)
|
||||
half2 = begin2, count = (end2 - begin2);
|
||||
while (count > 0) {
|
||||
step = count / 2;
|
||||
tmp = half2 + step;
|
||||
if (in[tmp] < in[half1]) {
|
||||
tmp++;
|
||||
half2 = tmp;
|
||||
count -= step + 1;
|
||||
} else {
|
||||
count = step;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma omp task default(shared)
|
||||
{
|
||||
MsMergeParallel(out, in, begin1, half1, begin2, half2, outBegin, deep - 1);
|
||||
}
|
||||
|
||||
long outBegin2 = outBegin + (half1 - begin1) + (half2 - begin2);
|
||||
#pragma omp task default(shared)
|
||||
{
|
||||
MsMergeParallel(out, in, half1, end1, half2, end2, outBegin2, deep - 1);
|
||||
}
|
||||
|
||||
#pragma omp taskwait
|
||||
} else {
|
||||
MsMergeSequential(out, in, begin1, end1, begin2, end2, outBegin);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* OpenMP Task-parallel MergeSort
|
||||
*/
|
||||
void MsParallel(int *array, int *tmp, bool inplace, long begin, long end, int deep) {
|
||||
if (begin < (end - 1)) {
|
||||
long half = (begin + end) / 2;
|
||||
if (deep){
|
||||
#pragma omp task default(shared)
|
||||
{
|
||||
MsParallel(array, tmp, !inplace, begin, half, deep - 1);
|
||||
}
|
||||
|
||||
#pragma omp task default(shared)
|
||||
{
|
||||
MsParallel(array, tmp, !inplace, half, end, deep - 1);
|
||||
}
|
||||
|
||||
#pragma omp taskwait
|
||||
}
|
||||
else {
|
||||
MsSequential(array, tmp, !inplace, begin, half);
|
||||
MsSequential(array, tmp, !inplace, half, end);
|
||||
}
|
||||
|
||||
if (inplace) {
|
||||
MsMergeParallel(array, tmp, begin, half, half, end, begin, deep);
|
||||
} else {
|
||||
MsMergeParallel(tmp, array, begin, half, half, end, begin, deep);
|
||||
}
|
||||
} else if (!inplace) {
|
||||
tmp[begin] = array[begin];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* OpenMP Task-parallel MergeSort
|
||||
* startup routine containing the Parallel Region
|
||||
*/
|
||||
void MsParallelOmp(int *array, int *tmp, const size_t size) {
|
||||
|
||||
// compute cut-off recursion level
|
||||
const int iMinTask = (omp_get_max_threads() * 5);
|
||||
int deep = 0;
|
||||
while ((1 << deep) < iMinTask) deep += 1;
|
||||
|
||||
#pragma omp parallel
|
||||
#pragma omp master
|
||||
{
|
||||
MsParallel(array, tmp, true, 0, size, deep);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief program entry point
|
||||
*/
|
||||
int main(int argc, char* argv[]) {
|
||||
// variables to measure the elapsed time
|
||||
struct timeval t1, t2;
|
||||
double etime;
|
||||
|
||||
// expect one command line arguments: array size
|
||||
if (argc != 2) {
|
||||
printf("Usage: MergeSort.exe <array size> \n");
|
||||
printf("\n");
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
else {
|
||||
const size_t stSize = strtol(argv[1], NULL, 10);
|
||||
int *data = (int*) malloc(stSize * sizeof(int));
|
||||
int *tmp = (int*) malloc(stSize * sizeof(int));
|
||||
int *ref = (int*) malloc(stSize * sizeof(int));
|
||||
|
||||
// first touch
|
||||
#pragma omp parallel for
|
||||
for (size_t idx = 0; idx < stSize; ++idx){
|
||||
data[idx] = 0;
|
||||
tmp[idx] = 0;
|
||||
}
|
||||
|
||||
printf("Initialization...\n");
|
||||
|
||||
srand(95);
|
||||
for (size_t idx = 0; idx < stSize; ++idx){
|
||||
data[idx] = (int) (stSize * (double(rand()) / RAND_MAX));
|
||||
}
|
||||
std::copy(data, data + stSize, ref);
|
||||
|
||||
double dSize = (stSize * sizeof(int)) / 1024 / 1024;
|
||||
printf("Sorting %zu elements of type int (%f MiB)...\n", stSize, dSize);
|
||||
|
||||
gettimeofday(&t1, NULL);
|
||||
MsParallelOmp(data, tmp, stSize);
|
||||
gettimeofday(&t2, NULL);
|
||||
|
||||
etime = (t2.tv_sec - t1.tv_sec) * 1000 + (t2.tv_usec - t1.tv_usec) / 1000;
|
||||
etime = etime / 1000;
|
||||
|
||||
printf("done, took %f sec. Verification...", etime);
|
||||
if (isSorted(ref, data, stSize)) {
|
||||
printf(" successful.\n");
|
||||
}
|
||||
else {
|
||||
printf(" FAILED.\n");
|
||||
}
|
||||
|
||||
free(data);
|
||||
free(tmp);
|
||||
free(ref);
|
||||
}
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
174
03-cut-offs/mergesort-co/mergesort.cpp
Normal file
174
03-cut-offs/mergesort-co/mergesort.cpp
Normal file
|
@ -0,0 +1,174 @@
|
|||
/*
|
||||
* This file is part of Christian's OpenMP software lab
|
||||
*
|
||||
* Copyright (C) 2016 by Christian Terboven <terboven@itc.rwth-aachen.de>
|
||||
* Copyright (C) 2016 by Jonas Hahnfeld <hahnfeld@itc.rwth-aachen.de>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <errno.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
|
||||
#include <cstdlib>
|
||||
#include <cstdio>
|
||||
|
||||
#include <cmath>
|
||||
#include <ctime>
|
||||
#include <cstring>
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* helper routine: check if array is sorted correctly
|
||||
*/
|
||||
bool isSorted(int ref[], int data[], const size_t size){
|
||||
std::sort(ref, ref + size);
|
||||
for (size_t idx = 0; idx < size; ++idx){
|
||||
if (ref[idx] != data[idx]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* sequential merge step (straight-forward implementation)
|
||||
*/
|
||||
// TODO: cut-off could also apply here (extra parameter?)
|
||||
// TODO: optional: we can also break merge in two halves
|
||||
void MsMergeSequential(int *out, int *in, long begin1, long end1, long begin2, long end2, long outBegin) {
|
||||
long left = begin1;
|
||||
long right = begin2;
|
||||
|
||||
long idx = outBegin;
|
||||
|
||||
while (left < end1 && right < end2) {
|
||||
if (in[left] <= in[right]) {
|
||||
out[idx] = in[left];
|
||||
left++;
|
||||
} else {
|
||||
out[idx] = in[right];
|
||||
right++;
|
||||
}
|
||||
idx++;
|
||||
}
|
||||
|
||||
while (left < end1) {
|
||||
out[idx] = in[left];
|
||||
left++, idx++;
|
||||
}
|
||||
|
||||
while (right < end2) {
|
||||
out[idx] = in[right];
|
||||
right++, idx++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* sequential MergeSort
|
||||
*/
|
||||
// TODO: remember one additional parameter (depth)
|
||||
// TODO: recursive calls could be taskyfied
|
||||
// TODO: task synchronization also is required
|
||||
void MsSequential(int *array, int *tmp, bool inplace, long begin, long end) {
|
||||
if (begin < (end - 1)) {
|
||||
const long half = (begin + end) / 2;
|
||||
MsSequential(array, tmp, !inplace, begin, half);
|
||||
MsSequential(array, tmp, !inplace, half, end);
|
||||
if (inplace) {
|
||||
MsMergeSequential(array, tmp, begin, half, half, end, begin);
|
||||
} else {
|
||||
MsMergeSequential(tmp, array, begin, half, half, end, begin);
|
||||
}
|
||||
} else if (!inplace) {
|
||||
tmp[begin] = array[begin];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Serial MergeSort
|
||||
*/
|
||||
// TODO: this function should create the parallel region
|
||||
// TODO: good point to compute a good depth level (cut-off)
|
||||
void MsSerial(int *array, int *tmp, const size_t size) {
|
||||
|
||||
// TODO: parallel version of MsSequential will receive one more parameter: 'depth' (used as cut-off)
|
||||
MsSequential(array, tmp, true, 0, size);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @brief program entry point
|
||||
*/
|
||||
int main(int argc, char* argv[]) {
|
||||
// variables to measure the elapsed time
|
||||
struct timeval t1, t2;
|
||||
double etime;
|
||||
|
||||
// expect one command line arguments: array size
|
||||
if (argc != 2) {
|
||||
printf("Usage: MergeSort.exe <array size> \n");
|
||||
printf("\n");
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
else {
|
||||
const size_t stSize = strtol(argv[1], NULL, 10);
|
||||
int *data = (int*) malloc(stSize * sizeof(int));
|
||||
int *tmp = (int*) malloc(stSize * sizeof(int));
|
||||
int *ref = (int*) malloc(stSize * sizeof(int));
|
||||
|
||||
printf("Initialization...\n");
|
||||
|
||||
srand(95);
|
||||
for (size_t idx = 0; idx < stSize; ++idx){
|
||||
data[idx] = (int) (stSize * (double(rand()) / RAND_MAX));
|
||||
}
|
||||
std::copy(data, data + stSize, ref);
|
||||
|
||||
double dSize = (stSize * sizeof(int)) / 1024 / 1024;
|
||||
printf("Sorting %zu elements of type int (%f MiB)...\n", stSize, dSize);
|
||||
|
||||
gettimeofday(&t1, NULL);
|
||||
MsSerial(data, tmp, stSize);
|
||||
gettimeofday(&t2, NULL);
|
||||
|
||||
etime = (t2.tv_sec - t1.tv_sec) * 1000 + (t2.tv_usec - t1.tv_usec) / 1000;
|
||||
etime = etime / 1000;
|
||||
|
||||
printf("done, took %f sec. Verification...", etime);
|
||||
if (isSorted(ref, data, stSize)) {
|
||||
printf(" successful.\n");
|
||||
}
|
||||
else {
|
||||
printf(" FAILED.\n");
|
||||
}
|
||||
|
||||
free(data);
|
||||
free(tmp);
|
||||
free(ref);
|
||||
}
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
29
03-cut-offs/sudoku-co/Makefile
Normal file
29
03-cut-offs/sudoku-co/Makefile
Normal file
|
@ -0,0 +1,29 @@
|
|||
BIN_DIR=../bin
|
||||
PROGRAM=$(BIN_DIR)/sudoku-co
|
||||
|
||||
CC=g++
|
||||
CFLAGS=-c -O3 -fopenmp -DUSE_SEQUENTIAL_CUTOFF
|
||||
LDFLAGS=-fopenmp
|
||||
|
||||
OBJS=SudokuBoard.o sudoku.o
|
||||
|
||||
all: $(PROGRAM)
|
||||
|
||||
$(PROGRAM): $(OBJS)
|
||||
$(CC) -o $@ $^ $(LDFLAGS)
|
||||
|
||||
%.o: %.cpp
|
||||
$(CC) $(CFLAGS) -o $@ $^
|
||||
|
||||
sudoku.cpp:
|
||||
@cat README.md
|
||||
@false
|
||||
|
||||
$(BIN_DIR):
|
||||
mkdir $@
|
||||
|
||||
clean:
|
||||
rm -rf $(PROGRAM) *.o
|
||||
|
||||
wipe: clean
|
||||
rm -rf *.out *.err
|
5
03-cut-offs/sudoku-co/README.md
Normal file
5
03-cut-offs/sudoku-co/README.md
Normal file
|
@ -0,0 +1,5 @@
|
|||
====================================================================================================
|
||||
Copy your previous version of 01-tasks/sudoku-tsk exercise (sudoku.c) and modify the code in order
|
||||
to use a native OpenMP cut-off mechanism (ie, the 'if' or the 'final' clauses) instead of the
|
||||
manual cut-off version implemented as: "USE_SEQUENTIAL_CUTOFF".
|
||||
====================================================================================================
|
119
03-cut-offs/sudoku-co/SudokuBoard.cpp
Normal file
119
03-cut-offs/sudoku-co/SudokuBoard.cpp
Normal file
|
@ -0,0 +1,119 @@
|
|||
/*
|
||||
* This file is part of Christian's OpenMP Task-parallel Sudoku Solver
|
||||
*
|
||||
* Copyright (C) 2013 by Christian Terboven <christian@terboven.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include "SudokuBoard.h"
|
||||
|
||||
#include <cstring>
|
||||
|
||||
CSudokuBoard::CSudokuBoard(int fsize, int bsize)
|
||||
: field_size(fsize), block_size(bsize), solutions(-1)
|
||||
{
|
||||
field = new int[field_size*field_size];
|
||||
}
|
||||
|
||||
|
||||
CSudokuBoard::CSudokuBoard(const CSudokuBoard& other)
|
||||
: field_size(other.getFieldSize()), block_size(other.getBlockSize()), solutions(other.getNumSolutions())
|
||||
{
|
||||
field = new int[field_size*field_size];
|
||||
std::memcpy(field, other.field, sizeof(int) * field_size*field_size);
|
||||
}
|
||||
|
||||
|
||||
CSudokuBoard::~CSudokuBoard(void)
|
||||
{
|
||||
delete[] field;
|
||||
}
|
||||
|
||||
|
||||
bool CSudokuBoard::loadFromFile(char *filename)
|
||||
{
|
||||
std::ifstream ifile(filename);
|
||||
|
||||
if (!ifile) {
|
||||
std::cout << "There was an error opening the input file " << filename << std::endl;
|
||||
std::cout << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < this->field_size; i++) {
|
||||
for (int j = 0; j < this->field_size; j++) {
|
||||
ifile >> this->field[ACCESS(i,j)];
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
void CSudokuBoard::printBoard()
|
||||
{
|
||||
for(int i = 0; i < field_size; i++) {
|
||||
for(int j = 0; j < field_size; j++) {
|
||||
std::cout << std::setw(3) <<
|
||||
this->field[ACCESS(i,j)]
|
||||
<< " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
bool CSudokuBoard::check(int x, int y, int value)
|
||||
{
|
||||
if(checkHorizontal(y, value))
|
||||
return true;
|
||||
if(checkVertical(x, value))
|
||||
return true;
|
||||
if(checkBox(x, y, value))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
bool CSudokuBoard::checkHorizontal(int y, int value) {
|
||||
for(int i = 0; i < field_size; i++)
|
||||
if(field[ACCESS(y,i)] == value)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
bool CSudokuBoard::checkVertical(int x, int value) {
|
||||
for(int i = 0; i < field_size; i++)
|
||||
if(field[ACCESS(i,x)] == value)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
bool CSudokuBoard::checkBox(int x, int y, int value) {
|
||||
// find suitable box edge
|
||||
int x_box = (int)(x / block_size) * block_size;
|
||||
int y_box = (int)(y / block_size) * block_size;
|
||||
|
||||
for(int i = y_box; i < y_box + block_size; i++)
|
||||
for(int j = x_box; j < x_box + block_size; j++)
|
||||
if(field[ACCESS(i,j)] == value)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
96
03-cut-offs/sudoku-co/SudokuBoard.h
Normal file
96
03-cut-offs/sudoku-co/SudokuBoard.h
Normal file
|
@ -0,0 +1,96 @@
|
|||
/*
|
||||
* This file is part of Christian's OpenMP Task-parallel Sudoku Solver
|
||||
*
|
||||
* Copyright (C) 2013 by Christian Terboven <christian@terboven.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
|
||||
#define ACCESS(x, y) (this->field_size*(x) + (y))
|
||||
|
||||
/**
|
||||
* @brief Representation of the Sudoku board, includes solver and utility functions
|
||||
*/
|
||||
class CSudokuBoard
|
||||
{
|
||||
public:
|
||||
CSudokuBoard(int fsize, int bsize);
|
||||
CSudokuBoard(const CSudokuBoard& other);
|
||||
~CSudokuBoard(void);
|
||||
|
||||
inline int getNumSolutions() const {
|
||||
return this->solutions;
|
||||
}
|
||||
|
||||
inline void incrementSolutionCounter() {
|
||||
if (this->solutions == -1)
|
||||
this->solutions = 1;
|
||||
else
|
||||
this->solutions++;
|
||||
}
|
||||
|
||||
inline int getFieldSize() const {
|
||||
return this->field_size;
|
||||
}
|
||||
|
||||
inline int getBlockSize() const {
|
||||
return this->block_size;
|
||||
}
|
||||
|
||||
inline int get(int x, int y) const {
|
||||
return this->field[ACCESS(x,y)];
|
||||
}
|
||||
|
||||
inline void set(int x, int y, int value) {
|
||||
this->field[ACCESS(x, y)] = value;
|
||||
}
|
||||
|
||||
/**
|
||||
* Read Sudoku template from file
|
||||
* @param filename name of file to read input from
|
||||
* @return true if reading file was successful, false if not
|
||||
*/
|
||||
bool loadFromFile(char *filename);
|
||||
|
||||
/**
|
||||
* Print the Sudoku board to stdout
|
||||
*/
|
||||
void printBoard();
|
||||
|
||||
/**
|
||||
* Check whether value already used
|
||||
* @return true if already used, false if not
|
||||
*/
|
||||
bool check(int x, int y, int value);
|
||||
|
||||
|
||||
private:
|
||||
bool checkHorizontal(int y, int value);
|
||||
bool checkVertical(int x, int value);
|
||||
bool checkBox(int x, int y, int value);
|
||||
|
||||
int field_size;
|
||||
int block_size;
|
||||
|
||||
int *field;
|
||||
|
||||
int solutions;
|
||||
};
|
177
03-cut-offs/sudoku-co/sudoku-solved.cpp
Normal file
177
03-cut-offs/sudoku-co/sudoku-solved.cpp
Normal file
|
@ -0,0 +1,177 @@
|
|||
/*
|
||||
* This file is part of Christian's OpenMP Task-parallel Sudoku Solver
|
||||
*
|
||||
* Copyright (C) 2013 by Christian Terboven <christian@terboven.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
#include "SudokuBoard.h"
|
||||
|
||||
|
||||
int found_sudokus = 0;
|
||||
|
||||
|
||||
/**
|
||||
* Solve the Sudoku puzzle by finding and counting all solutions (if all are requested)
|
||||
*/
|
||||
bool solve(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true, bool findAllSolutions = true)
|
||||
{
|
||||
if(x == sudoku->getFieldSize()) { // end of line
|
||||
y++;
|
||||
x = 0;
|
||||
if(y == sudoku->getFieldSize()) // end
|
||||
return true;
|
||||
}
|
||||
|
||||
if(sudoku->get(y, x) > 0) // field already set
|
||||
return solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions); // tackle next field
|
||||
|
||||
for(int i = 1; i <= sudoku->getFieldSize(); i++) { // try all numbers
|
||||
if(!sudoku->check(x, y, i)) {
|
||||
sudoku->set(y, x, i); // if number fits, set it
|
||||
if(solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions)) { // tackle next field
|
||||
#pragma omp atomic
|
||||
found_sudokus++;
|
||||
sudoku->incrementSolutionCounter(); // solution found :-)
|
||||
if (printAnyFoundSolution) {
|
||||
std::cout << "The following is a valid solution:" << std::endl;
|
||||
sudoku->printBoard();
|
||||
std::cout << std::endl;
|
||||
}
|
||||
if (!findAllSolutions) {
|
||||
return true; // return, as only one solution was asked for
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sudoku->set(y, x, 0); // no solution found, reset field
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Solve the Sudoku puzzle in parallel by finding and counting all solutions
|
||||
*/
|
||||
bool solve_parallel(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true)
|
||||
{
|
||||
if (x == sudoku->getFieldSize()) { // end of line
|
||||
y++;
|
||||
x = 0;
|
||||
if(y == sudoku->getFieldSize()) // end
|
||||
return true;
|
||||
}
|
||||
|
||||
if (sudoku->get(y, x) > 0) { // field already set
|
||||
return solve_parallel(x+1, y, sudoku, printAnyFoundSolution); // tackle next field
|
||||
}
|
||||
|
||||
//#if USE_SEQUENTIAL_CUTOFF
|
||||
// if ( y > 1 ) {
|
||||
// return solve(x, y, sudoku, printAnyFoundSolution);
|
||||
// }
|
||||
//#endif
|
||||
|
||||
for (int i = 1; i <= sudoku->getFieldSize(); i++) { // try all numbers
|
||||
if (!sudoku->check(x, y, i)) {
|
||||
#pragma omp task firstprivate(i,x,y,sudoku) final(y > 0)
|
||||
{
|
||||
CSudokuBoard new_sudoku(*sudoku);
|
||||
new_sudoku.set(y, x, i); // if number fits, set it
|
||||
|
||||
if (solve_parallel(x+1, y, &new_sudoku, printAnyFoundSolution)) { // tackle next field
|
||||
#pragma omp atomic
|
||||
found_sudokus++;
|
||||
// sudoku->incrementSolutionCounter(); // solution found :-)
|
||||
if (printAnyFoundSolution) {
|
||||
std::cout << "The following is a valid solution:" << std::endl;
|
||||
new_sudoku.printBoard();
|
||||
std::cout << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma omp taskwait
|
||||
sudoku->set(y, x, 0); // no solution found, reset field
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/** @brief program entry point
|
||||
*/
|
||||
int main(int argc, char* argv[]) {
|
||||
// measure the time
|
||||
double t3, t4;
|
||||
int nthreads = 0;
|
||||
|
||||
// expect three command line arguments: field size, block size, and input file
|
||||
if (argc != 4) {
|
||||
std::cout << "Usage: sudoku.exe <field size> <block size> <input filename>" << std::endl;
|
||||
std::cout << std::endl;
|
||||
return -1;
|
||||
}
|
||||
else {
|
||||
CSudokuBoard *sudoku1 = new CSudokuBoard(atoi(argv[1]), atoi(argv[2]));
|
||||
if (!sudoku1->loadFromFile(argv[3])) {
|
||||
std::cout << "There was an error reading a Sudoku template from " << argv[3] << std::endl;
|
||||
std::cout << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// print the Sudoku board template
|
||||
std::cout << "Given Sudoku template" << std::endl;
|
||||
sudoku1->printBoard();
|
||||
|
||||
// solve the Sudoku by finding (and printing) all solutions
|
||||
t3 = omp_get_wtime();
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
solve_parallel(0, 0, sudoku1, false);
|
||||
}
|
||||
t4 = omp_get_wtime();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
#pragma omp master
|
||||
nthreads = omp_get_num_threads();
|
||||
}
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "In total there were " << sudoku1->getNumSolutions() << " solutions found in serial." << std::endl;
|
||||
std::cout << "In total there were " << found_sudokus << " solutions found in parallel " << std::endl;
|
||||
std::cout << std::endl;
|
||||
|
||||
delete sudoku1;
|
||||
}
|
||||
|
||||
// print the time
|
||||
std::cout << "Parallel computation took " << t4 - t3 << " seconds ("
|
||||
<< nthreads << " threads)." << std::endl << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
23
04-matmul/Makefile
Normal file
23
04-matmul/Makefile
Normal file
|
@ -0,0 +1,23 @@
|
|||
BIN_DIR=.
|
||||
PROGRAM=$(BIN_DIR)/matmul
|
||||
|
||||
MKL_HOME=$(MKLROOT)
|
||||
|
||||
CC=gcc
|
||||
CFLAGS=-O3 -std=gnu99 -fopenmp
|
||||
LDFLAGS=-fopenmp
|
||||
#VERBOSE=-DVERBOSE
|
||||
|
||||
all: $(PROGRAM)
|
||||
|
||||
$(PROGRAM): matmul.c
|
||||
$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
|
||||
|
||||
$(BIN_DIR):
|
||||
mkdir $@
|
||||
|
||||
clean:
|
||||
rm -rf $(PROGRAM) *.o
|
||||
|
||||
wipe: clean
|
||||
rm -rf *.out *.err
|
174
04-matmul/matmul.c
Normal file
174
04-matmul/matmul.c
Normal file
|
@ -0,0 +1,174 @@
|
|||
//===-- matmul.c - Different implementations of matrix multiplies -*- C -*-===//
|
||||
//
|
||||
// Part of the LOMP Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
#define N 3072
|
||||
|
||||
#define DUMP_MATRIX 0
|
||||
#define DUMP_TASKS 0
|
||||
|
||||
void matmul_seq(float * C, float * A, float * B, size_t n) {
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (size_t k = 0; k < n; ++k) {
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
C[i * n + j] += A[i * n + k] * B[k * n + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void matmul_par(float * C, float * A, float * B, size_t n) {
|
||||
#pragma omp parallel for schedule(static,8) firstprivate(n)
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (size_t k = 0; k < n; ++k) {
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
C[i * n + j] += A[i * n + k] * B[k * n + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void matmul_tloop(float * C, float * A, float * B, size_t n) {
|
||||
#pragma omp parallel firstprivate(n)
|
||||
#pragma omp single nowait
|
||||
#pragma omp taskloop
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (size_t k = 0; k < n; ++k) {
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
C[i * n + j] += A[i * n + k] * B[k * n + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void matmul_task(float * C, float * A, float * B, size_t n) {
|
||||
const size_t bf = 256;
|
||||
if (n % bf != 0) {
|
||||
printf("Blocking factor does not divide matrix size!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
#pragma omp parallel firstprivate(n, bf)
|
||||
#pragma omp master
|
||||
{
|
||||
// work on the blocks of the matrix
|
||||
for (size_t ib = 0; ib < n; ib += bf)
|
||||
for (size_t kb = 0; kb < n; kb += bf)
|
||||
for (size_t jb = 0; jb < n; jb += bf) {
|
||||
#pragma omp task firstprivate(ib, kb, jb) firstprivate(n, bf) \
|
||||
depend(inout:C[ib * n + jb:bf]) \
|
||||
depend(in:A[ib * n + kb:bf]) \
|
||||
depend(in:B[kb * n + jb:bf])
|
||||
{
|
||||
#if DUMP_TASKS
|
||||
printf("task: C[%ld,%ld] += A[%ld,%ld] o B[%ld,%ld]\n", ib, jb, ib, kb, kb, jb);
|
||||
#endif
|
||||
// work on a single block``
|
||||
for (size_t i = ib; i < (ib + bf); ++i)
|
||||
for (size_t k = kb; k < (kb + bf); ++k)
|
||||
for (size_t j = jb; j < (jb + bf); ++j)
|
||||
C[i * n + j] += A[i * n + k] * B[k * n + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void init_mat(float * C, float * A, float * B, size_t n) {
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
C[i * n + j] = 0.0;
|
||||
A[i * n + j] = 0.5;
|
||||
B[i * n + j] = 0.25;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void dump_mat(float * mtx, size_t n) {
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
printf("%f ", mtx[i * n + j]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
float sum_mat(float * mtx, size_t n) {
|
||||
float sum = 0.0;
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
sum += mtx[i * n + j];
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
double ts, te;
|
||||
double t_seq;
|
||||
|
||||
float * C;
|
||||
float * A;
|
||||
float * B;
|
||||
|
||||
// If number of arguments is not 1, print help
|
||||
if (argc != 2) {
|
||||
printf("%s: matrix_size\n", argv[0]);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
const int n = atoi(argv[1]); // matrix size
|
||||
|
||||
C = (float *) malloc(sizeof(*C) * n * n);
|
||||
A = (float *) malloc(sizeof(*A) * n * n);
|
||||
B = (float *) malloc(sizeof(*B) * n * n);
|
||||
|
||||
init_mat(C, A, B, n);
|
||||
ts = omp_get_wtime();
|
||||
matmul_seq(C, A, B, n);
|
||||
te = omp_get_wtime();
|
||||
#if DUMP_MATRIX
|
||||
dump_mat(C, n);
|
||||
#endif
|
||||
t_seq = te - ts;
|
||||
printf("Sum of matrix (serial): %f, wall time %lf, speed-up %.2lf\n",
|
||||
sum_mat(C, n), (te-ts), t_seq / (te-ts));
|
||||
|
||||
init_mat(C, A, B, n);
|
||||
ts = omp_get_wtime();
|
||||
matmul_par(C, A, B, n);
|
||||
te = omp_get_wtime();
|
||||
#if DUMP_MATRIX
|
||||
dump_mat(C, n);
|
||||
#endif
|
||||
printf("Sum of matrix (parallel): %f, wall time %lf, speed-up %.2lf\n",
|
||||
sum_mat(C, n), (te-ts), t_seq / (te-ts));
|
||||
|
||||
init_mat(C, A, B, n);
|
||||
ts = omp_get_wtime();
|
||||
matmul_tloop(C, A, B, n);
|
||||
te = omp_get_wtime();
|
||||
#if DUMP_MATRIX
|
||||
dump_mat(C, n);
|
||||
#endif
|
||||
printf("Sum of matrix (taskloop): %f, wall time %lf, speed-up %.2lf\n",
|
||||
sum_mat(C, n), (te-ts), t_seq / (te-ts));
|
||||
|
||||
init_mat(C, A, B, n);
|
||||
ts = omp_get_wtime();
|
||||
matmul_task(C, A, B, n);
|
||||
te = omp_get_wtime();
|
||||
#if DUMP_MATRIX
|
||||
dump_mat(C, n);
|
||||
#endif
|
||||
printf("Sum of matrix (tasks): %f, wall time %lf, speed-up %.2lf\n",
|
||||
sum_mat(C, n), (te-ts), t_seq / (te-ts));
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
165
04-matmul/solution/matmul.c
Normal file
165
04-matmul/solution/matmul.c
Normal file
|
@ -0,0 +1,165 @@
|
|||
//===-- matmul.c - Different implementations of matrix multiplies -*- C -*-===//
|
||||
//
|
||||
// Part of the LOMP Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
#define N 3072
|
||||
|
||||
#define DUMP_MATRIX 0
|
||||
#define DUMP_TASKS 0
|
||||
|
||||
void matmul_seq(float * C, float * A, float * B, size_t n) {
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (size_t k = 0; k < n; ++k) {
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
C[i * n + j] += A[i * n + k] * B[k * n + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void matmul_par(float * C, float * A, float * B, size_t n) {
|
||||
#pragma omp parallel for schedule(static,8) firstprivate(n)
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (size_t k = 0; k < n; ++k) {
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
C[i * n + j] += A[i * n + k] * B[k * n + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void matmul_tloop(float * C, float * A, float * B, size_t n) {
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (size_t k = 0; k < n; ++k) {
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
C[i * n + j] += A[i * n + k] * B[k * n + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void matmul_task(float * C, float * A, float * B, size_t n) {
|
||||
const size_t bf = 256;
|
||||
if (n % bf != 0) {
|
||||
printf("Blocking factor does not divide matrix size!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
{
|
||||
// work on the blocks of the matrix
|
||||
for (size_t ib = 0; ib < n; ib += bf)
|
||||
for (size_t kb = 0; kb < n; kb += bf)
|
||||
for (size_t jb = 0; jb < n; jb += bf) {
|
||||
{
|
||||
#if DUMP_TASKS
|
||||
printf("task: C[%ld,%ld] += A[%ld,%ld] o B[%ld,%ld]\n", ib, jb, ib, kb, kb, jb);
|
||||
#endif
|
||||
// work on a single block``
|
||||
for (size_t i = ib; i < (ib + bf); ++i)
|
||||
for (size_t k = kb; k < (kb + bf); ++k)
|
||||
for (size_t j = jb; j < (jb + bf); ++j)
|
||||
C[i * n + j] += A[i * n + k] * B[k * n + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void init_mat(float * C, float * A, float * B, size_t n) {
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
C[i * n + j] = 0.0;
|
||||
A[i * n + j] = 0.5;
|
||||
B[i * n + j] = 0.25;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void dump_mat(float * mtx, size_t n) {
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
printf("%f ", mtx[i * n + j]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
float sum_mat(float * mtx, size_t n) {
|
||||
float sum = 0.0;
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
for (size_t j = 0; j < n; ++j) {
|
||||
sum += mtx[i * n + j];
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
double ts, te;
|
||||
double t_seq;
|
||||
|
||||
float * C;
|
||||
float * A;
|
||||
float * B;
|
||||
|
||||
// If number of arguments is not 1, print help
|
||||
if (argc != 2) {
|
||||
printf("%s: matrix_size\n", argv[0]);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
const int n = atoi(argv[1]); // matrix size
|
||||
|
||||
C = (float *) malloc(sizeof(*C) * n * n);
|
||||
A = (float *) malloc(sizeof(*A) * n * n);
|
||||
B = (float *) malloc(sizeof(*B) * n * n);
|
||||
|
||||
init_mat(C, A, B, n);
|
||||
ts = omp_get_wtime();
|
||||
matmul_seq(C, A, B, n);
|
||||
te = omp_get_wtime();
|
||||
#if DUMP_MATRIX
|
||||
dump_mat(C, n);
|
||||
#endif
|
||||
t_seq = te - ts;
|
||||
printf("Sum of matrix (serial): %f, wall time %lf, speed-up %.2lf\n",
|
||||
sum_mat(C, n), (te-ts), t_seq / (te-ts));
|
||||
|
||||
init_mat(C, A, B, n);
|
||||
ts = omp_get_wtime();
|
||||
matmul_par(C, A, B, n);
|
||||
te = omp_get_wtime();
|
||||
#if DUMP_MATRIX
|
||||
dump_mat(C, n);
|
||||
#endif
|
||||
printf("Sum of matrix (parallel): %f, wall time %lf, speed-up %.2lf\n",
|
||||
sum_mat(C, n), (te-ts), t_seq / (te-ts));
|
||||
|
||||
init_mat(C, A, B, n);
|
||||
ts = omp_get_wtime();
|
||||
matmul_tloop(C, A, B, n);
|
||||
te = omp_get_wtime();
|
||||
#if DUMP_MATRIX
|
||||
dump_mat(C, n);
|
||||
#endif
|
||||
printf("Sum of matrix (taskloop): %f, wall time %lf, speed-up %.2lf\n",
|
||||
sum_mat(C, n), (te-ts), t_seq / (te-ts));
|
||||
|
||||
init_mat(C, A, B, n);
|
||||
ts = omp_get_wtime();
|
||||
matmul_task(C, A, B, n);
|
||||
te = omp_get_wtime();
|
||||
#if DUMP_MATRIX
|
||||
dump_mat(C, n);
|
||||
#endif
|
||||
printf("Sum of matrix (tasks): %f, wall time %lf, speed-up %.2lf\n",
|
||||
sum_mat(C, n), (te-ts), t_seq / (te-ts));
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
20
07-fibonacci/Makefile
Normal file
20
07-fibonacci/Makefile
Normal file
|
@ -0,0 +1,20 @@
|
|||
PROG=fibonacci
|
||||
|
||||
CC=gcc
|
||||
CCFLAGS=-O3 -fopenmp
|
||||
LDFLAGS=-fopenmp
|
||||
|
||||
debug:
|
||||
$(CC) $(CCFLAGS) -g -O0 ${PROG}.c -o ${PROG}.exe
|
||||
|
||||
release:
|
||||
$(CC) $(CCFLAGS) -O3 ${PROG}.c -o ${PROG}.exe
|
||||
|
||||
solution:
|
||||
$(CC) $(CCFLAGS) -O3 ${PROG}-solved.c -o ${PROG}-solved.exe
|
||||
|
||||
run:
|
||||
./${PROG}.exe < input
|
||||
|
||||
clean:
|
||||
rm -f ${PROG}.exe ${PROG}-solved.exe ${PROG}.o *~
|
79
07-fibonacci/fibonacci-solved.c
Normal file
79
07-fibonacci/fibonacci-solved.c
Normal file
|
@ -0,0 +1,79 @@
|
|||
/*
|
||||
* OpenMP lecture exercises
|
||||
* Copyright (C) 2011 by Christian Terboven <terboven@rz.rwth-aachen.de>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <omp.h>
|
||||
|
||||
int ser_fib(int n)
|
||||
{
|
||||
int x, y;
|
||||
if (n < 2)
|
||||
return n;
|
||||
|
||||
x = ser_fib(n - 1);
|
||||
|
||||
y = ser_fib(n - 2);
|
||||
|
||||
return x+y;
|
||||
}
|
||||
|
||||
int fib(int n)
|
||||
{
|
||||
int x, y;
|
||||
if (n < 2)
|
||||
return n;
|
||||
else if (n < 30)
|
||||
return ser_fib(n);
|
||||
|
||||
#pragma omp task shared(x)
|
||||
{
|
||||
x = fib(n - 1);
|
||||
}
|
||||
|
||||
#pragma omp task shared(y)
|
||||
{
|
||||
y = fib(n - 2);
|
||||
}
|
||||
|
||||
#pragma omp taskwait
|
||||
|
||||
return x+y;
|
||||
|
||||
}
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
int n,fibonacci;
|
||||
double starttime;
|
||||
printf("Please insert n, to calculate fib(n): \n");
|
||||
scanf("%d",&n);
|
||||
starttime=omp_get_wtime();
|
||||
|
||||
#pragma omp parallel
|
||||
#pragma omp single
|
||||
{
|
||||
fibonacci=fib(n);
|
||||
}
|
||||
|
||||
printf("fib(%d)=%d \n",n,fibonacci);
|
||||
printf("calculation took %lf sec\n",omp_get_wtime()-starttime);
|
||||
return 0;
|
||||
}
|
53
07-fibonacci/fibonacci.c
Normal file
53
07-fibonacci/fibonacci.c
Normal file
|
@ -0,0 +1,53 @@
|
|||
/*
|
||||
* OpenMP lecture exercises
|
||||
* Copyright (C) 2011 by Christian Terboven <terboven@rz.rwth-aachen.de>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <omp.h>
|
||||
|
||||
|
||||
int fib(int n)
|
||||
{
|
||||
int x, y;
|
||||
if (n < 2)
|
||||
return n;
|
||||
|
||||
x = fib(n - 1);
|
||||
|
||||
y = fib(n - 2);
|
||||
|
||||
return x+y;
|
||||
|
||||
}
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
int n,fibonacci;
|
||||
double starttime;
|
||||
printf("Please insert n, to calculate fib(n): \n");
|
||||
scanf("%d",&n);
|
||||
starttime=omp_get_wtime();
|
||||
|
||||
fibonacci=fib(n);
|
||||
|
||||
printf("fib(%d)=%d \n",n,fibonacci);
|
||||
printf("calculation took %lf sec\n",omp_get_wtime()-starttime);
|
||||
return 0;
|
||||
}
|
1
07-fibonacci/input
Normal file
1
07-fibonacci/input
Normal file
|
@ -0,0 +1 @@
|
|||
35
|
20
08-quicksort/Makefile
Normal file
20
08-quicksort/Makefile
Normal file
|
@ -0,0 +1,20 @@
|
|||
PROG=quicksort
|
||||
|
||||
CXX = g++
|
||||
CXXFLAGS=-O3 -fopenmp
|
||||
LDFLAGS=-fopenmp
|
||||
|
||||
debug:
|
||||
$(CXX) $(CXXFLAGS) -g -O0 ${PROG}.cpp -o ${PROG}.exe
|
||||
|
||||
release:
|
||||
$(CXX) $(CXXFLAGS) -O3 ${PROG}.cpp -o ${PROG}.exe
|
||||
|
||||
solution:
|
||||
$(CXX) $(CXXFLAGS) -O3 ${PROG}-solved.cpp -o ${PROG}-solved.exe
|
||||
|
||||
run:
|
||||
./${PROG}.exe
|
||||
|
||||
clean:
|
||||
rm -f ${PROG}.exe ${PROG}-solved.exe ${PROG}.o *~
|
98
08-quicksort/quicksort-solved.cpp
Normal file
98
08-quicksort/quicksort-solved.cpp
Normal file
|
@ -0,0 +1,98 @@
|
|||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <ctime>
|
||||
#include <omp.h>
|
||||
#include <stdio.h>
|
||||
using namespace std;
|
||||
|
||||
void init(int* array,int length){
|
||||
srand((unsigned)time(0));
|
||||
for(int i=0;i<length;i++)
|
||||
array[i]=rand()%1234 + 1;
|
||||
}
|
||||
|
||||
void swap(int &a, int &b){
|
||||
int temp=a;
|
||||
a=b;
|
||||
b=temp;
|
||||
}
|
||||
|
||||
|
||||
int pivot(int* array, int first, int last){
|
||||
int p = first;
|
||||
int pivotElement = array[first];
|
||||
for(int i=first+1;i<=last;i++){
|
||||
if(array[i]<=pivotElement){
|
||||
p++;
|
||||
swap(array[i],array[p]);
|
||||
}
|
||||
}
|
||||
swap(array[p],array[first]);
|
||||
return p;
|
||||
}
|
||||
|
||||
void serial_quicksort(int * array, int first, int last){
|
||||
int pivotElement;
|
||||
if(first<last){
|
||||
pivotElement = pivot(array,first,last);
|
||||
serial_quicksort(array,first,pivotElement-1);
|
||||
serial_quicksort(array,pivotElement+1,last);
|
||||
}
|
||||
}
|
||||
|
||||
void quicksort(int * array, int first, int last){
|
||||
int pivotElement;
|
||||
if((last - first + 1) < 10000) {
|
||||
serial_quicksort(array, first, last);
|
||||
} else {
|
||||
pivotElement = pivot(array,first,last);
|
||||
#pragma omp task default(shared)
|
||||
{
|
||||
quicksort(array,first,pivotElement-1);
|
||||
}
|
||||
#pragma omp task default(shared)
|
||||
{
|
||||
quicksort(array,pivotElement+1,last);
|
||||
}
|
||||
#pragma omp taskwait
|
||||
}
|
||||
}
|
||||
|
||||
bool checkFn(int * array,int length){
|
||||
for(int i=0;i<length-1;i++){
|
||||
if(array[i]>array[i+1]){
|
||||
cout << "array[" << i << "] > array[" << i+1 << "]" << endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
int length;
|
||||
int *toBeSorted;
|
||||
double t1,t2;
|
||||
if(argc < 2){
|
||||
length=6000000;
|
||||
}else{
|
||||
length=atoi(argv[1]);
|
||||
}
|
||||
toBeSorted = new int[length];
|
||||
|
||||
init(toBeSorted, length);
|
||||
cout << "Sorting an array of " << length << " elements." << endl;
|
||||
|
||||
t1=omp_get_wtime();
|
||||
#pragma omp parallel
|
||||
#pragma omp single
|
||||
{
|
||||
quicksort(toBeSorted,0,length-1);
|
||||
}
|
||||
t2=omp_get_wtime()-t1;
|
||||
|
||||
cout << "quicksort took " << t2 << " sec. to complete" << endl;
|
||||
if (!checkFn(toBeSorted, length)) {
|
||||
cout << "validation failed!" << endl;
|
||||
}
|
||||
delete [] toBeSorted;
|
||||
}
|
76
08-quicksort/quicksort.cpp
Normal file
76
08-quicksort/quicksort.cpp
Normal file
|
@ -0,0 +1,76 @@
|
|||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <ctime>
|
||||
#include <omp.h>
|
||||
#include <stdio.h>
|
||||
using namespace std;
|
||||
|
||||
void init(int* array,int length){
|
||||
srand((unsigned)time(0));
|
||||
for(int i=0;i<length;i++)
|
||||
array[i]=rand()%1234 + 1;
|
||||
}
|
||||
|
||||
void swap(int &a, int &b){
|
||||
int temp=a;
|
||||
a=b;
|
||||
b=temp;
|
||||
}
|
||||
|
||||
|
||||
int pivot(int* array, int first, int last){
|
||||
int p = first;
|
||||
int pivotElement = array[first];
|
||||
for(int i=first+1;i<=last;i++){
|
||||
if(array[i]<=pivotElement){
|
||||
p++;
|
||||
swap(array[i],array[p]);
|
||||
}
|
||||
}
|
||||
swap(array[p],array[first]);
|
||||
return p;
|
||||
}
|
||||
|
||||
void quicksort(int * array, int first, int last){
|
||||
int pivotElement;
|
||||
if(first<last){
|
||||
pivotElement = pivot(array,first,last);
|
||||
quicksort(array,first,pivotElement-1);
|
||||
quicksort(array,pivotElement+1,last);
|
||||
}
|
||||
}
|
||||
|
||||
bool checkFn(int * array,int length){
|
||||
for(int i=0;i<length-1;i++){
|
||||
if(array[i]>array[i+1]){
|
||||
cout << "array[" << i << "] > array[" << i+1 << "]" << endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]){
|
||||
int length;
|
||||
int *toBeSorted;
|
||||
double t1,t2;
|
||||
if(argc < 2){
|
||||
length=6000000;
|
||||
}else{
|
||||
length=atoi(argv[1]);
|
||||
}
|
||||
toBeSorted = new int[length];
|
||||
|
||||
init(toBeSorted, length);
|
||||
cout << "Sorting an array of " << length << " elements." << endl;
|
||||
|
||||
t1=omp_get_wtime();
|
||||
quicksort(toBeSorted,0,length-1);
|
||||
t2=omp_get_wtime()-t1;
|
||||
|
||||
cout << "quicksort took " << t2 << " sec. to complete" << endl;
|
||||
if (!checkFn(toBeSorted, length)) {
|
||||
cout << "validation failed!" << endl;
|
||||
}
|
||||
delete [] toBeSorted;
|
||||
}
|
31
Makefile
Normal file
31
Makefile
Normal file
|
@ -0,0 +1,31 @@
|
|||
ALL_RECURSIVE=all-recursive clean-recursive
|
||||
|
||||
TUT=SC19
|
||||
|
||||
DIRS=$(shell ls -d *-* | sed '/_/d' 2>/dev/null)
|
||||
|
||||
all: $(CONFIG) all-recursive
|
||||
clean: $(CONFIG) clean-recursive
|
||||
|
||||
$(ALL_RECURSIVE):
|
||||
@failcom='exit 1';\
|
||||
target=`echo $@ | sed s/-recursive//`; \
|
||||
for subdir in $(DIRS); do \
|
||||
(cd $$subdir && $(MAKE) $$target) || eval $$failcom; \
|
||||
done;
|
||||
|
||||
dist:
|
||||
@failcom='exit 1';\
|
||||
for subdir in $(DIRS); do \
|
||||
(cd $$subdir && $(MAKE) clean) || eval $$failcom; \
|
||||
done; \
|
||||
tar --exclude="*-solved*" -cvzf ../mt-openmp-$(TUT).tar.gz --transform s/mt-openmp/mt-openmp-$(TUT)/ ../mt-openmp/*
|
||||
|
||||
dist-solved:
|
||||
@failcom='exit 1';\
|
||||
for subdir in $(DIRS); do \
|
||||
(cd $$subdir && $(MAKE) clean) || eval $$failcom; \
|
||||
done; \
|
||||
tar -cvzf ../mt-openmp-$(TUT)-solved.tar.gz --transform s/mt-openmp/mt-openmp-$(TUT)-solved/ ../mt-openmp/*
|
||||
|
||||
|
BIN
slides/day-1/00-OpenMP_Introduction.pdf
Normal file
BIN
slides/day-1/00-OpenMP_Introduction.pdf
Normal file
Binary file not shown.
BIN
slides/day-1/01-OpenMP_Hands-on.pdf
Normal file
BIN
slides/day-1/01-OpenMP_Hands-on.pdf
Normal file
Binary file not shown.
BIN
slides/day-1/03-OpenMP-NUMA.pdf
Normal file
BIN
slides/day-1/03-OpenMP-NUMA.pdf
Normal file
Binary file not shown.
BIN
slides/day-2/04-OpenMP-Task-Intro.pdf
Normal file
BIN
slides/day-2/04-OpenMP-Task-Intro.pdf
Normal file
Binary file not shown.
BIN
slides/day-2/05-OpenMP-Task-Loops-Deps.pdf
Normal file
BIN
slides/day-2/05-OpenMP-Task-Loops-Deps.pdf
Normal file
Binary file not shown.
BIN
slides/day-2/06-OpenMP-SIMD.pdf
Normal file
BIN
slides/day-2/06-OpenMP-SIMD.pdf
Normal file
Binary file not shown.
BIN
slides/day-3/07-OpenMP-GPUs.pdf
Normal file
BIN
slides/day-3/07-OpenMP-GPUs.pdf
Normal file
Binary file not shown.
BIN
slides/day-3/08-Openmp-Misc-Advanced-Outlook.pdf
Normal file
BIN
slides/day-3/08-Openmp-Misc-Advanced-Outlook.pdf
Normal file
Binary file not shown.
Loading…
Reference in a new issue