Exercises and slides

This commit is contained in:
Jose Gracia 2024-04-26 15:47:20 +02:00
commit 8307cf55bf
70 changed files with 5333 additions and 0 deletions

14
00-worksharing/Makefile Normal file
View file

@ -0,0 +1,14 @@
ALL_RECURSIVE=all-recursive clean-recursive
DIRS=$(shell ls -d *-* | sed '/_/d' 2>/dev/null)
all: $(CONFIG) all-recursive
clean: $(CONFIG) clean-recursive
$(ALL_RECURSIVE):
@failcom='exit 1';\
target=`echo $@ | sed s/-recursive//`; \
for subdir in $(DIRS); do \
(cd $$subdir && $(MAKE) $$target) || eval $$failcom; \
done;

View file

View file

@ -0,0 +1,30 @@
#!/bin/bash
#SBATCH --job-name=cholesky
#SBATCH --workdir=.
#SBATCH --output=cholesky_%j.out
#SBATCH --error=cholesky_%j.err
#SBATCH --cpus-per-task=48
#SBATCH --ntasks=1
#SBATCH --time=00:30:00
#SBATCH --qos=debug
export IFS=";"
THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48"
#THREADS="01;02;04;06;08;10;12"
#MSIZES="2048;4096;8192"
MSIZES="8192"
MSIZES="16384"
MSIZES="2048"
BSIZES="256"
for MS in $MSIZES; do
for BS in $BSIZES; do
for threads in $THREADS; do
OMP_NUM_THREADS=$threads ./cholesky-for $MS $BS 0
OMP_NUM_THREADS=$threads ./cholesky-for-opt $MS $BS 0
done
done
done

View file

@ -0,0 +1,23 @@
BIN_DIR=../bin
PROGRAM=$(BIN_DIR)/cholesky-for-opt
MKL_HOME=$(MKLROOT)
CC=gcc
CFLAGS=-O3 -std=gnu99 -I$(MKL_HOME)/include -fopenmp
LDFLAGS=-L$(MKL_HOME)/lib/intel64 -lmkl_sequential -lmkl_core -lmkl_rt -lpthread -lm -fopenmp
#VERBOSE=-DVERBOSE
all: $(PROGRAM)
$(PROGRAM): cholesky.c
$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
$(BIN_DIR):
mkdir $@
clean:
rm -rf $(PROGRAM) *.o
wipe: clean
rm -rf *.out *.err

View file

@ -0,0 +1,289 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <assert.h>
#include <sys/time.h>
#include <sys/times.h>
#include <math.h>
#include <mkl.h>
#include "omp.h"
#if !defined(_OPENMP)
int omp_get_max_threads() { return 1; }
int omp_get_num_threads() { return 1; }
#endif
void cholesky(int ts, int nt, double* Ah[nt][nt])
{
#ifdef VERBOSE
printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
#endif
for (int k = 0; k < nt; k++) {
// Diagonal Block factorization: using LAPACK
LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
// Triangular systems
for (int i = k + 1; i < nt; i++) {
cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
}
// Update trailing matrix
for (int i = k + 1; i < nt; i++) {
for (int j = k + 1; j < i; j++) {
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
}
cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
}
}
#ifdef VERBOSE
printf("> ...end of Cholesky Factorization.\n");
#endif
}
float get_time()
{
static double gtod_ref_time_sec = 0.0;
struct timeval tv;
gettimeofday(&tv, NULL);
// If this is the first invocation of through dclock(), then initialize the
// "reference time" global variable to the seconds field of the tv struct.
if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
// Normalize the seconds field of the tv struct so that it is relative to the
// "reference time" that was recorded during the first invocation of dclock().
const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
// Compute the number of seconds since the reference time.
const double t = norm_sec + tv.tv_usec * 1.0e-6;
return (float) t;
}
// Robust Check the factorization of the matrix A2
// Using directly Fortran services: dlacpy_, dtrmm, dlange_
static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
{
#ifdef VERBOSE
printf("> Checking the Cholesky Factorization... \n");
#endif
char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
double *Residual = (double *) malloc(N*N*sizeof(double));
double *L1 = (double *) malloc(N*N*sizeof(double));
double *L2 = (double *) malloc(N*N*sizeof(double));
double *work = (double *) malloc(N*sizeof(double));
memset((void*)L1, 0, N*N*sizeof(double));
memset((void*)L2, 0, N*N*sizeof(double));
double alpha= 1.0;
dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
/* Dealing with L'L or U'U */
if (uplo == 'U'){
dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
} else{
dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
}
/* Compute the Residual || A -L'L|| */
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
#ifdef VERBOSE
printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
#endif
const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
#ifdef VERBOSE
if ( info_factorization) printf("> - Factorization is suspicious!\n");
else printf("> - Factorization is CORRECT!\n");
#endif
free(Residual); free(L1); free(L2); free(work);
return info_factorization;
}
void initialize_matrix(const int n, const int ts, double *matrix)
{
#ifdef VERBOSE
printf("> Initializing matrix with random values...\n");
#endif
int ISEED[4] = {0,0,0,1};
int intONE=1;
for (int i = 0; i < n*n; i+=n) {
dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
}
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
matrix[i*n+j] = matrix[j*n+i];
}
}
// Diagonal values
for (int i = 0; i < n; i++) {
matrix[i*n+i] += (double) n;
}
}
void gather_block(const int N, const int ts, double *Alin, double *A)
{
for (int i = 0; i < ts; i++)
for (int j = 0; j < ts; j++) {
A[i*ts + j] = Alin[i*N + j];
}
}
void scatter_block(const int N, const int ts, double *A, double *Alin)
{
for (int i = 0; i < ts; i++)
for (int j = 0; j < ts; j++) {
Alin[i*N + j] = A[i*ts + j];
}
}
void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
{
#ifdef VERBOSE
printf("> Converting linear matrix to blocks...\n");
#endif
for (int i = 0; i < DIM; i++)
for (int j = 0; j < DIM; j++) {
gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
}
}
void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
{
#ifdef VERBOSE
printf("> Converting blocked matrix to linear...\n");
#endif
for (int i = 0; i < DIM; i++)
for (int j = 0; j < DIM; j++) {
scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
}
}
int main(int argc, char* argv[])
{
char *result[3] = {"n/a","pass","FAIL"};
const double eps = pow(2.0, -53);
// If number of arguments is not 4, print help
if ( argc != 4) {
printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
exit( -1 );
}
const int n = atoi(argv[1]); // matrix size
const int ts = atoi(argv[2]); // tile size
int check = atoi(argv[3]); // check result?
// Compute number of tiles
const int nt = n / ts;
assert((nt*ts) == n); // tile size should divide size
// Allocate matrix
double * const matrix = (double *) malloc(n * n * sizeof(double));
assert(matrix != NULL);
// Initialize matrix
initialize_matrix(n, ts, matrix);
// Allocate original matrix, and duplicate it, for debugging purposes
double * const original_matrix = (double *) malloc(n * n * sizeof(double));
assert(original_matrix != NULL);
// Save a copy of the original matrix
for (int i = 0; i < n * n; i++ ) {
original_matrix[i] = matrix[i];
}
// Set version description: Indirect blocked matrix
const char *version = "I-Blocked matrix";
// Allocate blocked matrix
double *Ah[nt][nt];
for (int i = 0; i < nt; i++) {
for (int j = 0; j < nt; j++) {
Ah[i][j] = malloc(ts * ts * sizeof(double));
assert(Ah[i][j] != NULL);
}
}
// ---------------------------------------
// Convert, compute (time), and re-convert
// ---------------------------------------
convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
const float tref = get_time();
cholesky(ts, nt, (double* (*)[nt]) Ah);
const float time = get_time() - tref;
convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
// Free blocked matrix
for (int i = 0; i < nt; i++) {
for (int j = 0; j < nt; j++) {
assert(Ah[i][j] != NULL);
free(Ah[i][j]);
}
}
// Check result, if requested
if ( check ) {
const char uplo = 'L';
if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
}
// Free original matrix, not needed anymore
free(original_matrix);
// Compute GFLOPs (Not verified)
float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
// Print results
#ifdef VERBOSE
printf( "\n" );
printf( "============ CHOLESKY RESULTS ============\n" );
printf( " test %s\n", argv[0]);
printf( " version %s\n", version);
printf( " matrix size: %dx%d\n", n, n);
printf( " tile size: %dx%d\n", ts, ts);
printf( " number of threads: %d\n", omp_get_max_threads());
printf( " time (s): %f\n", time);
printf( " performance (gflops): %f\n", gflops);
printf( " check: %s\n", result[check]);
printf( "==========================================\n" );
#else
printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
#endif
// Free matrix
free(matrix);
return 0;
}

View file

@ -0,0 +1,293 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <assert.h>
#include <sys/time.h>
#include <sys/times.h>
#include <math.h>
#include <mkl.h>
#include "omp.h"
#if !defined(_OPENMP)
int omp_get_max_threads() { return 1; }
int omp_get_num_threads() { return 1; }
#endif
void cholesky(int ts, int nt, double* Ah[nt][nt])
{
#ifdef VERBOSE
printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
#endif
for (int k = 0; k < nt; k++) {
// Diagonal Block factorization: using LAPACK
LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
// Triangular systems
#pragma omp parallel for
for (int i = k + 1; i < nt; i++) {
cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
}
// Update trailing matrix
#pragma omp parallel
for (int i = k + 1; i < nt; i++) {
#pragma omp for schedule(dynamic, 1) nowait
for (int j = k + 1; j < i; j++) {
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
}
#pragma omp single nowait
cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
}
}
#ifdef VERBOSE
printf("> ...end of Cholesky Factorization.\n");
#endif
}
float get_time()
{
static double gtod_ref_time_sec = 0.0;
struct timeval tv;
gettimeofday(&tv, NULL);
// If this is the first invocation of through dclock(), then initialize the
// "reference time" global variable to the seconds field of the tv struct.
if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
// Normalize the seconds field of the tv struct so that it is relative to the
// "reference time" that was recorded during the first invocation of dclock().
const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
// Compute the number of seconds since the reference time.
const double t = norm_sec + tv.tv_usec * 1.0e-6;
return (float) t;
}
// Robust Check the factorization of the matrix A2
// Using directly Fortran services: dlacpy_, dtrmm, dlange_
static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
{
#ifdef VERBOSE
printf("> Checking the Cholesky Factorization... \n");
#endif
char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
double *Residual = (double *) malloc(N*N*sizeof(double));
double *L1 = (double *) malloc(N*N*sizeof(double));
double *L2 = (double *) malloc(N*N*sizeof(double));
double *work = (double *) malloc(N*sizeof(double));
memset((void*)L1, 0, N*N*sizeof(double));
memset((void*)L2, 0, N*N*sizeof(double));
double alpha= 1.0;
dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
/* Dealing with L'L or U'U */
if (uplo == 'U'){
dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
} else{
dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
}
/* Compute the Residual || A -L'L|| */
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
#ifdef VERBOSE
printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
#endif
const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
#ifdef VERBOSE
if ( info_factorization) printf("> - Factorization is suspicious!\n");
else printf("> - Factorization is CORRECT!\n");
#endif
free(Residual); free(L1); free(L2); free(work);
return info_factorization;
}
void initialize_matrix(const int n, const int ts, double *matrix)
{
#ifdef VERBOSE
printf("> Initializing matrix with random values...\n");
#endif
int ISEED[4] = {0,0,0,1};
int intONE=1;
for (int i = 0; i < n*n; i+=n) {
dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
}
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
matrix[i*n+j] = matrix[j*n+i];
}
}
// Diagonal values
for (int i = 0; i < n; i++) {
matrix[i*n+i] += (double) n;
}
}
void gather_block(const int N, const int ts, double *Alin, double *A)
{
for (int i = 0; i < ts; i++)
for (int j = 0; j < ts; j++) {
A[i*ts + j] = Alin[i*N + j];
}
}
void scatter_block(const int N, const int ts, double *A, double *Alin)
{
for (int i = 0; i < ts; i++)
for (int j = 0; j < ts; j++) {
Alin[i*N + j] = A[i*ts + j];
}
}
void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
{
#ifdef VERBOSE
printf("> Converting linear matrix to blocks...\n");
#endif
for (int i = 0; i < DIM; i++)
for (int j = 0; j < DIM; j++) {
gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
}
}
void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
{
#ifdef VERBOSE
printf("> Converting blocked matrix to linear...\n");
#endif
for (int i = 0; i < DIM; i++)
for (int j = 0; j < DIM; j++) {
scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
}
}
int main(int argc, char* argv[])
{
char *result[3] = {"n/a","pass","FAIL"};
const double eps = pow(2.0, -53);
// If number of arguments is not 4, print help
if ( argc != 4) {
printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
exit( -1 );
}
const int n = atoi(argv[1]); // matrix size
const int ts = atoi(argv[2]); // tile size
int check = atoi(argv[3]); // check result?
// Compute number of tiles
const int nt = n / ts;
assert((nt*ts) == n); // tile size should divide size
// Allocate matrix
double * const matrix = (double *) malloc(n * n * sizeof(double));
assert(matrix != NULL);
// Initialize matrix
initialize_matrix(n, ts, matrix);
// Allocate original matrix, and duplicate it, for debugging purposes
double * const original_matrix = (double *) malloc(n * n * sizeof(double));
assert(original_matrix != NULL);
// Save a copy of the original matrix
for (int i = 0; i < n * n; i++ ) {
original_matrix[i] = matrix[i];
}
// Set version description: Indirect blocked matrix
const char *version = "I-Blocked matrix";
// Allocate blocked matrix
double *Ah[nt][nt];
for (int i = 0; i < nt; i++) {
for (int j = 0; j < nt; j++) {
Ah[i][j] = malloc(ts * ts * sizeof(double));
assert(Ah[i][j] != NULL);
}
}
// ---------------------------------------
// Convert, compute (time), and re-convert
// ---------------------------------------
convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
const float tref = get_time();
cholesky(ts, nt, (double* (*)[nt]) Ah);
const float time = get_time() - tref;
convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
// Free blocked matrix
for (int i = 0; i < nt; i++) {
for (int j = 0; j < nt; j++) {
assert(Ah[i][j] != NULL);
free(Ah[i][j]);
}
}
// Check result, if requested
if ( check ) {
const char uplo = 'L';
if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
}
// Free original matrix, not needed anymore
free(original_matrix);
// Compute GFLOPs (Not verified)
float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
// Print results
#ifdef VERBOSE
printf( "\n" );
printf( "============ CHOLESKY RESULTS ============\n" );
printf( " test %s\n", argv[0]);
printf( " version %s\n", version);
printf( " matrix size: %dx%d\n", n, n);
printf( " tile size: %dx%d\n", ts, ts);
printf( " number of threads: %d\n", omp_get_max_threads());
printf( " time (s): %f\n", time);
printf( " performance (gflops): %f\n", gflops);
printf( " check: %s\n", result[check]);
printf( "==========================================\n" );
#else
printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
#endif
// Free matrix
free(matrix);
return 0;
}

View file

@ -0,0 +1,23 @@
BIN_DIR=../bin
PROGRAM=$(BIN_DIR)/cholesky-for
MKL_HOME=$(MKLROOT)
CC=gcc
CFLAGS=-O3 -std=gnu99 -I$(MKL_HOME)/include -fopenmp
LDFLAGS=-L$(MKL_HOME)/lib/intel64 -lmkl_sequential -lmkl_core -lmkl_rt -lpthread -lm -fopenmp
#VERBOSE=-DVERBOSE
all: $(PROGRAM)
$(PROGRAM): cholesky.c
$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
$(BIN_DIR):
mkdir $@
clean:
rm -rf $(PROGRAM) *.o
wipe: clean
rm -rf *.out *.err

View file

@ -0,0 +1,289 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <assert.h>
#include <sys/time.h>
#include <sys/times.h>
#include <math.h>
#include <mkl.h>
#include "omp.h"
#if !defined(_OPENMP)
int omp_get_max_threads() { return 1; }
int omp_get_num_threads() { return 1; }
#endif
void cholesky(int ts, int nt, double* Ah[nt][nt])
{
#ifdef VERBOSE
printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
#endif
for (int k = 0; k < nt; k++) {
// Diagonal Block factorization: using LAPACK
LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
// Triangular systems
for (int i = k + 1; i < nt; i++) {
cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
}
// Update trailing matrix
for (int i = k + 1; i < nt; i++) {
for (int j = k + 1; j < i; j++) {
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
}
cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
}
}
#ifdef VERBOSE
printf("> ...end of Cholesky Factorization.\n");
#endif
}
float get_time()
{
static double gtod_ref_time_sec = 0.0;
struct timeval tv;
gettimeofday(&tv, NULL);
// If this is the first invocation of through dclock(), then initialize the
// "reference time" global variable to the seconds field of the tv struct.
if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
// Normalize the seconds field of the tv struct so that it is relative to the
// "reference time" that was recorded during the first invocation of dclock().
const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
// Compute the number of seconds since the reference time.
const double t = norm_sec + tv.tv_usec * 1.0e-6;
return (float) t;
}
// Robust Check the factorization of the matrix A2
// Using directly Fortran services: dlacpy_, dtrmm, dlange_
static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
{
#ifdef VERBOSE
printf("> Checking the Cholesky Factorization... \n");
#endif
char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
double *Residual = (double *) malloc(N*N*sizeof(double));
double *L1 = (double *) malloc(N*N*sizeof(double));
double *L2 = (double *) malloc(N*N*sizeof(double));
double *work = (double *) malloc(N*sizeof(double));
memset((void*)L1, 0, N*N*sizeof(double));
memset((void*)L2, 0, N*N*sizeof(double));
double alpha= 1.0;
dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
/* Dealing with L'L or U'U */
if (uplo == 'U'){
dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
} else{
dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
}
/* Compute the Residual || A -L'L|| */
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
#ifdef VERBOSE
printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
#endif
const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
#ifdef VERBOSE
if ( info_factorization) printf("> - Factorization is suspicious!\n");
else printf("> - Factorization is CORRECT!\n");
#endif
free(Residual); free(L1); free(L2); free(work);
return info_factorization;
}
void initialize_matrix(const int n, const int ts, double *matrix)
{
#ifdef VERBOSE
printf("> Initializing matrix with random values...\n");
#endif
int ISEED[4] = {0,0,0,1};
int intONE=1;
for (int i = 0; i < n*n; i+=n) {
dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
}
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
matrix[i*n+j] = matrix[j*n+i];
}
}
// Diagonal values
for (int i = 0; i < n; i++) {
matrix[i*n+i] += (double) n;
}
}
void gather_block(const int N, const int ts, double *Alin, double *A)
{
for (int i = 0; i < ts; i++)
for (int j = 0; j < ts; j++) {
A[i*ts + j] = Alin[i*N + j];
}
}
void scatter_block(const int N, const int ts, double *A, double *Alin)
{
for (int i = 0; i < ts; i++)
for (int j = 0; j < ts; j++) {
Alin[i*N + j] = A[i*ts + j];
}
}
void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
{
#ifdef VERBOSE
printf("> Converting linear matrix to blocks...\n");
#endif
for (int i = 0; i < DIM; i++)
for (int j = 0; j < DIM; j++) {
gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
}
}
void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
{
#ifdef VERBOSE
printf("> Converting blocked matrix to linear...\n");
#endif
for (int i = 0; i < DIM; i++)
for (int j = 0; j < DIM; j++) {
scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
}
}
int main(int argc, char* argv[])
{
char *result[3] = {"n/a","pass","FAIL"};
const double eps = pow(2.0, -53);
// If number of arguments is not 4, print help
if ( argc != 4) {
printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
exit( -1 );
}
const int n = atoi(argv[1]); // matrix size
const int ts = atoi(argv[2]); // tile size
int check = atoi(argv[3]); // check result?
// Compute number of tiles
const int nt = n / ts;
assert((nt*ts) == n); // tile size should divide size
// Allocate matrix
double * const matrix = (double *) malloc(n * n * sizeof(double));
assert(matrix != NULL);
// Initialize matrix
initialize_matrix(n, ts, matrix);
// Allocate original matrix, and duplicate it, for debugging purposes
double * const original_matrix = (double *) malloc(n * n * sizeof(double));
assert(original_matrix != NULL);
// Save a copy of the original matrix
for (int i = 0; i < n * n; i++ ) {
original_matrix[i] = matrix[i];
}
// Set version description: Indirect blocked matrix
const char *version = "I-Blocked matrix";
// Allocate blocked matrix
double *Ah[nt][nt];
for (int i = 0; i < nt; i++) {
for (int j = 0; j < nt; j++) {
Ah[i][j] = malloc(ts * ts * sizeof(double));
assert(Ah[i][j] != NULL);
}
}
// ---------------------------------------
// Convert, compute (time), and re-convert
// ---------------------------------------
convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
const float tref = get_time();
cholesky(ts, nt, (double* (*)[nt]) Ah);
const float time = get_time() - tref;
convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
// Free blocked matrix
for (int i = 0; i < nt; i++) {
for (int j = 0; j < nt; j++) {
assert(Ah[i][j] != NULL);
free(Ah[i][j]);
}
}
// Check result, if requested
if ( check ) {
const char uplo = 'L';
if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
}
// Free original matrix, not needed anymore
free(original_matrix);
// Compute GFLOPs (Not verified)
float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
// Print results
#ifdef VERBOSE
printf( "\n" );
printf( "============ CHOLESKY RESULTS ============\n" );
printf( " test %s\n", argv[0]);
printf( " version %s\n", version);
printf( " matrix size: %dx%d\n", n, n);
printf( " tile size: %dx%d\n", ts, ts);
printf( " number of threads: %d\n", omp_get_max_threads());
printf( " time (s): %f\n", time);
printf( " performance (gflops): %f\n", gflops);
printf( " check: %s\n", result[check]);
printf( "==========================================\n" );
#else
printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
#endif
// Free matrix
free(matrix);
return 0;
}

View file

@ -0,0 +1,291 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <assert.h>
#include <sys/time.h>
#include <sys/times.h>
#include <math.h>
#include <mkl.h>
#include "omp.h"
#if !defined(_OPENMP)
int omp_get_max_threads() { return 1; }
int omp_get_num_threads() { return 1; }
#endif
void cholesky(int ts, int nt, double* Ah[nt][nt])
{
#ifdef VERBOSE
printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
#endif
for (int k = 0; k < nt; k++) {
// Diagonal Block factorization: using LAPACK
LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
// Triangular systems
#pragma omp parallel for
for (int i = k + 1; i < nt; i++) {
cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
}
// Update trailing matrix
#pragma omp parallel for
for (int i = k + 1; i < nt; i++) {
for (int j = k + 1; j < i; j++) {
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
}
cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
}
}
#ifdef VERBOSE
printf("> ...end of Cholesky Factorization.\n");
#endif
}
float get_time()
{
static double gtod_ref_time_sec = 0.0;
struct timeval tv;
gettimeofday(&tv, NULL);
// If this is the first invocation of through dclock(), then initialize the
// "reference time" global variable to the seconds field of the tv struct.
if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
// Normalize the seconds field of the tv struct so that it is relative to the
// "reference time" that was recorded during the first invocation of dclock().
const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
// Compute the number of seconds since the reference time.
const double t = norm_sec + tv.tv_usec * 1.0e-6;
return (float) t;
}
// Robust Check the factorization of the matrix A2
// Using directly Fortran services: dlacpy_, dtrmm, dlange_
static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
{
#ifdef VERBOSE
printf("> Checking the Cholesky Factorization... \n");
#endif
char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
double *Residual = (double *) malloc(N*N*sizeof(double));
double *L1 = (double *) malloc(N*N*sizeof(double));
double *L2 = (double *) malloc(N*N*sizeof(double));
double *work = (double *) malloc(N*sizeof(double));
memset((void*)L1, 0, N*N*sizeof(double));
memset((void*)L2, 0, N*N*sizeof(double));
double alpha= 1.0;
dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
/* Dealing with L'L or U'U */
if (uplo == 'U'){
dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
} else{
dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
}
/* Compute the Residual || A -L'L|| */
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
#ifdef VERBOSE
printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
#endif
const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
#ifdef VERBOSE
if ( info_factorization) printf("> - Factorization is suspicious!\n");
else printf("> - Factorization is CORRECT!\n");
#endif
free(Residual); free(L1); free(L2); free(work);
return info_factorization;
}
void initialize_matrix(const int n, const int ts, double *matrix)
{
#ifdef VERBOSE
printf("> Initializing matrix with random values...\n");
#endif
int ISEED[4] = {0,0,0,1};
int intONE=1;
for (int i = 0; i < n*n; i+=n) {
dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
}
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
matrix[i*n+j] = matrix[j*n+i];
}
}
// Diagonal values
for (int i = 0; i < n; i++) {
matrix[i*n+i] += (double) n;
}
}
void gather_block(const int N, const int ts, double *Alin, double *A)
{
for (int i = 0; i < ts; i++)
for (int j = 0; j < ts; j++) {
A[i*ts + j] = Alin[i*N + j];
}
}
void scatter_block(const int N, const int ts, double *A, double *Alin)
{
for (int i = 0; i < ts; i++)
for (int j = 0; j < ts; j++) {
Alin[i*N + j] = A[i*ts + j];
}
}
void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
{
#ifdef VERBOSE
printf("> Converting linear matrix to blocks...\n");
#endif
for (int i = 0; i < DIM; i++)
for (int j = 0; j < DIM; j++) {
gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
}
}
void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
{
#ifdef VERBOSE
printf("> Converting blocked matrix to linear...\n");
#endif
for (int i = 0; i < DIM; i++)
for (int j = 0; j < DIM; j++) {
scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
}
}
int main(int argc, char* argv[])
{
char *result[3] = {"n/a","pass","FAIL"};
const double eps = pow(2.0, -53);
// If number of arguments is not 4, print help
if ( argc != 4) {
printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
exit( -1 );
}
const int n = atoi(argv[1]); // matrix size
const int ts = atoi(argv[2]); // tile size
int check = atoi(argv[3]); // check result?
// Compute number of tiles
const int nt = n / ts;
assert((nt*ts) == n); // tile size should divide size
// Allocate matrix
double * const matrix = (double *) malloc(n * n * sizeof(double));
assert(matrix != NULL);
// Initialize matrix
initialize_matrix(n, ts, matrix);
// Allocate original matrix, and duplicate it, for debugging purposes
double * const original_matrix = (double *) malloc(n * n * sizeof(double));
assert(original_matrix != NULL);
// Save a copy of the original matrix
for (int i = 0; i < n * n; i++ ) {
original_matrix[i] = matrix[i];
}
// Set version description: Indirect blocked matrix
const char *version = "I-Blocked matrix";
// Allocate blocked matrix
double *Ah[nt][nt];
for (int i = 0; i < nt; i++) {
for (int j = 0; j < nt; j++) {
Ah[i][j] = malloc(ts * ts * sizeof(double));
assert(Ah[i][j] != NULL);
}
}
// ---------------------------------------
// Convert, compute (time), and re-convert
// ---------------------------------------
convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
const float tref = get_time();
cholesky(ts, nt, (double* (*)[nt]) Ah);
const float time = get_time() - tref;
convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
// Free blocked matrix
for (int i = 0; i < nt; i++) {
for (int j = 0; j < nt; j++) {
assert(Ah[i][j] != NULL);
free(Ah[i][j]);
}
}
// Check result, if requested
if ( check ) {
const char uplo = 'L';
if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
}
// Free original matrix, not needed anymore
free(original_matrix);
// Compute GFLOPs (Not verified)
float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
// Print results
#ifdef VERBOSE
printf( "\n" );
printf( "============ CHOLESKY RESULTS ============\n" );
printf( " test %s\n", argv[0]);
printf( " version %s\n", version);
printf( " matrix size: %dx%d\n", n, n);
printf( " tile size: %dx%d\n", ts, ts);
printf( " number of threads: %d\n", omp_get_max_threads());
printf( " time (s): %f\n", time);
printf( " performance (gflops): %f\n", gflops);
printf( " check: %s\n", result[check]);
printf( "==========================================\n" );
#else
printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
#endif
// Free matrix
free(matrix);
return 0;
}

View file

@ -0,0 +1,23 @@
BIN_DIR=../bin
PROGRAM=$(BIN_DIR)/matmul
MKL_HOME=$(MKLROOT)
CC=gcc
CFLAGS=-O3 -std=gnu99 -fopenmp
LDFLAGS=-fopenmp
#VERBOSE=-DVERBOSE
all: $(PROGRAM)
$(PROGRAM): matmul.c
$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
$(BIN_DIR):
mkdir $@
clean:
rm -rf $(PROGRAM) *.o
wipe: clean
rm -rf *.out *.err

View file

@ -0,0 +1,106 @@
//===-- matmul.c - Different implementations of matrix multiplies -*- C -*-===//
//
// Part of the LOMP Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#define DUMP_MATRIX 0
void matmul_seq(double * C, double * A, double * B, size_t n) {
for (size_t i = 0; i < n; ++i) {
for (size_t k = 0; k < n; ++k) {
for (size_t j = 0; j < n; ++j) {
C[i * n + j] += A[i * n + k] * B[k * n + j];
}
}
}
}
void matmul_par(double * C, double * A, double * B, size_t n) {
for (size_t i = 0; i < n; ++i) {
for (size_t k = 0; k < n; ++k) {
for (size_t j = 0; j < n; ++j) {
C[i * n + j] += A[i * n + k] * B[k * n + j];
}
}
}
}
void init_mat(double * C, double * A, double * B, size_t n) {
for (size_t i = 0; i < n; ++i) {
for (size_t j = 0; j < n; ++j) {
C[i * n + j] = 0.0;
A[i * n + j] = 0.5;
B[i * n + j] = 0.25;
}
}
}
void dump_mat(double * mtx, size_t n) {
for (size_t i = 0; i < n; ++i) {
for (size_t j = 0; j < n; ++j) {
printf("%f ", mtx[i * n + j]);
}
printf("\n");
}
}
double sum_mat(double * mtx, size_t n) {
double sum = 0.0;
for (size_t i = 0; i < n; ++i) {
for (size_t j = 0; j < n; ++j) {
sum += mtx[i * n + j];
}
}
return sum;
}
int main(int argc, char * argv[]) {
double ts, te;
double t_seq;
double * C;
double * A;
double * B;
// If number of arguments is not 1, print help
if (argc != 2) {
printf("%s: matrix_size\n", argv[0]);
return EXIT_FAILURE;
}
const int n = atoi(argv[1]); // matrix size
C = (double *)malloc(sizeof(*C) * n * n);
A = (double *)malloc(sizeof(*A) * n * n);
B = (double *)malloc(sizeof(*B) * n * n);
init_mat(C, A, B, n);
ts = omp_get_wtime();
matmul_seq(C, A, B, n);
te = omp_get_wtime();
#if DUMP_MATRIX
dump_mat(C, n);
#endif
t_seq = te - ts;
printf("Sum of matrix (serial): %f, wall time %lf, speed-up %.2lf\n",
sum_mat(C, n), (te - ts), t_seq / (te - ts));
init_mat(C, A, B, n);
ts = omp_get_wtime();
matmul_par(C, A, B, n);
te = omp_get_wtime();
#if DUMP_MATRIX
dump_mat(C, n);
#endif
printf("Sum of matrix (parallel): %f, wall time %lf, speed-up %.2lf\n",
sum_mat(C, n), (te - ts), t_seq / (te - ts));
return EXIT_SUCCESS;
}

View file

@ -0,0 +1,108 @@
//===-- matmul.c - Different implementations of matrix multiplies -*- C -*-===//
//
// Part of the LOMP Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#define DUMP_MATRIX 0
void matmul_seq(double * C, double * A, double * B, size_t n) {
for (size_t i = 0; i < n; ++i) {
for (size_t k = 0; k < n; ++k) {
for (size_t j = 0; j < n; ++j) {
C[i * n + j] += A[i * n + k] * B[k * n + j];
}
}
}
}
void matmul_par(double * C, double * A, double * B, size_t n) {
#pragma omp parallel for shared(A,B,C) firstprivate(n) \
schedule(static) // collapse(2)
for (size_t i = 0; i < n; ++i) {
for (size_t k = 0; k < n; ++k) {
for (size_t j = 0; j < n; ++j) {
C[i * n + j] += A[i * n + k] * B[k * n + j];
}
}
}
}
void init_mat(double * C, double * A, double * B, size_t n) {
for (size_t i = 0; i < n; ++i) {
for (size_t j = 0; j < n; ++j) {
C[i * n + j] = 0.0;
A[i * n + j] = 0.5;
B[i * n + j] = 0.25;
}
}
}
void dump_mat(double * mtx, size_t n) {
for (size_t i = 0; i < n; ++i) {
for (size_t j = 0; j < n; ++j) {
printf("%f ", mtx[i * n + j]);
}
printf("\n");
}
}
double sum_mat(double * mtx, size_t n) {
double sum = 0.0;
for (size_t i = 0; i < n; ++i) {
for (size_t j = 0; j < n; ++j) {
sum += mtx[i * n + j];
}
}
return sum;
}
int main(int argc, char * argv[]) {
double ts, te;
double t_seq;
double * C;
double * A;
double * B;
// If number of arguments is not 1, print help
if (argc != 2) {
printf("%s: matrix_size\n", argv[0]);
return EXIT_FAILURE;
}
const int n = atoi(argv[1]); // matrix size
C = (double *)malloc(sizeof(*C) * n * n);
A = (double *)malloc(sizeof(*A) * n * n);
B = (double *)malloc(sizeof(*B) * n * n);
init_mat(C, A, B, n);
ts = omp_get_wtime();
matmul_seq(C, A, B, n);
te = omp_get_wtime();
#if DUMP_MATRIX
dump_mat(C, n);
#endif
t_seq = te - ts;
printf("Sum of matrix (serial): %f, wall time %lf, speed-up %.2lf\n",
sum_mat(C, n), (te - ts), t_seq / (te - ts));
init_mat(C, A, B, n);
ts = omp_get_wtime();
matmul_par(C, A, B, n);
te = omp_get_wtime();
#if DUMP_MATRIX
dump_mat(C, n);
#endif
printf("Sum of matrix (parallel): %f, wall time %lf, speed-up %.2lf\n",
sum_mat(C, n), (te - ts), t_seq / (te - ts));
return EXIT_SUCCESS;
}

View file

@ -0,0 +1,21 @@
BIN_DIR=../bin
PROGRAM=$(BIN_DIR)/sin-cos
CC=gcc
CFLAGS=-O3 -std=gnu99 -fopenmp
LDFLAGS=-lm -fopenmp
#VERBOSE=-DVERBOSE
all: $(PROGRAM)
$(PROGRAM): sin-cos.c
$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
$(BIN_DIR):
mkdir $@
clean:
rm -rf $(PROGRAM) *.o
wipe: clean
rm -rf *.out *.err

View file

@ -0,0 +1,58 @@
/*
* OpenMP lecture exercises
* Copyright (C) 2011 by Christian Terboven <terboven@rz.rwth-aachen.de>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include <omp.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
double do_some_computation(int i)
{
double t = 0.0;
int j;
for (j = 0; j < i*i; j++)
{
t += sin((double)j) * cos((double)j);
}
return t;
}
int main(int argc, char* argv[])
{
const int dimension = 500;
int i;
double result = 0.0;
double t1 = omp_get_wtime();
for (i = 0; i < dimension; i++)
{
result += do_some_computation(i);
}
double t2 = omp_get_wtime();
printf("Computation took %.3lf seconds.\n", t2 - t1);
printf("Result is %.3lf.\n", result);
return 0;
}

View file

@ -0,0 +1,59 @@
/*
* OpenMP lecture exercises
* Copyright (C) 2011 by Christian Terboven <terboven@rz.rwth-aachen.de>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include <omp.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
double do_some_computation(int i)
{
double t = 0.0;
int j;
for (j = 0; j < i*i; j++)
{
t += sin((double)j) * cos((double)j);
}
return t;
}
int main(int argc, char* argv[])
{
const int dimension = 500;
int i;
double result = 0.0;
double t1 = omp_get_wtime();
#pragma omp parallel for schedule(dynamic) reduction(+:result)
for (i = 0; i < dimension; i++)
{
result += do_some_computation(i);
}
double t2 = omp_get_wtime();
printf("Computation took %.3lf seconds.\n", t2 - t1);
printf("Result is %.3lf.\n", result);
return 0;
}

14
01-tasks/Makefile Normal file
View file

@ -0,0 +1,14 @@
ALL_RECURSIVE=all-recursive clean-recursive
DIRS=$(shell ls -d *-* | sed '/_/d' 2>/dev/null)
all: $(CONFIG) all-recursive
clean: $(CONFIG) clean-recursive
$(ALL_RECURSIVE):
@failcom='exit 1';\
target=`echo $@ | sed s/-recursive//`; \
for subdir in $(DIRS); do \
(cd $$subdir && $(MAKE) $$target) || eval $$failcom; \
done;

0
01-tasks/bin/.gitdir Normal file
View file

28
01-tasks/bin/run-cholesky.sh Executable file
View file

@ -0,0 +1,28 @@
#!/bin/bash
#SBATCH --job-name=cholesky
#SBATCH --workdir=.
#SBATCH --output=cholesky_%j.out
#SBATCH --error=cholesky_%j.err
#SBATCH --cpus-per-task=48
#SBATCH --ntasks=1
#SBATCH --time=00:30:00
#SBATCH --qos=debug
export IFS=";"
THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48"
#THREADS="01;02;04;06;08;10;12"
#MSIZES="2048;4096;8192;16384"
MSIZES="4096"
BSIZES="512"
for MS in $MSIZES; do
for BS in $BSIZES; do
for threads in $THREADS; do
OMP_NUM_THREADS=$threads ./cholesky-tsk $MS $BS 0
done
done
done

20
01-tasks/bin/run-sudoku.sh Executable file
View file

@ -0,0 +1,20 @@
#!/bin/bash
#SBATCH --job-name=sudoku
#SBATCH --workdir=.
#SBATCH --output=sudoku_%j.out
#SBATCH --error=sudoku_%j.err
#SBATCH --cpus-per-task=48
#SBATCH --ntasks=1
#SBATCH --time=00:30:00
#SBATCH --qos=debug
export IFS=";"
THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48"
#THREADS="01;02;04;06;08;10;12"
for threads in $THREADS; do
OMP_NUM_THREADS=$threads ./sudoku-tsk 9 3 sudoku-9x9-1.txt
#OMP_NUM_THREADS=$threads ./sudoku-tsk 16 4 sudoku-16x16-1.txt
#OMP_NUM_THREADS=$threads ./sudoku-tsk 25 5 sudoku-25x25-1.txt
done

View file

@ -0,0 +1,16 @@
0 6 0 0 0 0 0 8 11 0 0 15 14 0 0 16
15 11 0 0 0 16 14 0 0 0 12 0 0 6 0 0
13 0 9 12 0 0 0 0 3 16 14 0 15 11 10 0
2 0 16 0 11 0 15 10 1 0 0 0 0 0 0 0
0 15 11 10 0 0 16 2 13 8 9 12 0 0 0 0
12 13 0 0 4 1 5 6 2 3 0 0 0 0 11 10
5 0 6 1 12 0 9 0 15 11 10 7 16 0 0 3
0 2 0 0 0 10 0 11 6 0 5 0 0 13 0 9
10 7 15 11 16 0 0 0 12 13 0 0 0 0 0 6
9 0 0 0 0 0 1 0 0 2 0 16 10 0 0 11
1 0 4 6 9 13 0 0 7 0 11 0 3 16 0 0
16 14 0 0 7 0 10 15 4 6 1 0 0 0 13 8
11 10 0 15 0 0 0 16 9 12 13 0 0 1 5 4
0 0 12 0 1 4 6 0 16 0 0 0 11 10 0 0
0 0 5 0 8 12 13 0 10 0 0 11 2 0 0 14
3 16 0 0 10 0 0 7 0 0 6 0 0 0 12 0

View file

@ -0,0 +1,25 @@
0 8 0 18 0 12 20 21 16 0 0 0 0 0 0 15 0 0 24 5 7 0 19 0 0
0 0 15 9 5 4 0 23 0 18 12 20 0 21 16 7 0 0 0 2 1 0 0 0 0
0 0 0 0 14 7 0 0 0 0 0 24 0 0 0 0 10 0 0 16 0 0 23 18 6
21 20 12 0 16 0 0 25 14 22 7 0 0 19 2 4 0 23 8 6 0 24 0 0 0
19 0 7 0 2 15 0 0 0 9 0 0 18 0 0 0 0 0 0 0 12 0 21 0 16
0 23 6 0 0 16 0 0 0 20 14 25 11 0 0 5 24 7 3 9 2 19 1 0 17
4 21 0 20 0 0 0 12 22 11 2 19 0 0 0 0 0 15 23 0 0 3 7 0 9
12 25 14 0 22 0 19 0 0 13 0 0 24 7 0 0 0 0 21 10 6 0 15 8 18
7 0 0 24 9 6 0 15 18 0 0 0 0 0 10 0 13 1 19 0 0 25 0 11 22
0 19 2 13 17 5 0 0 9 0 6 0 8 15 0 14 11 12 25 22 16 0 0 0 10
0 0 0 1 0 0 0 0 3 7 8 0 15 0 0 11 12 10 0 0 0 0 18 4 0
10 16 11 12 25 0 14 22 19 1 0 0 0 17 3 20 0 0 6 21 8 0 0 0 0
0 0 8 0 23 20 0 18 21 4 11 16 0 0 25 0 7 0 2 0 0 14 22 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 22 19 8 15 0 5 23 24 0 17 0 3
0 2 0 7 3 8 0 9 23 15 20 0 4 0 21 13 1 0 14 19 0 0 10 12 25
2 0 0 0 0 18 15 5 0 23 0 0 21 6 20 17 0 14 1 13 0 12 0 25 11
0 1 0 19 13 9 7 0 24 0 0 0 23 0 8 0 25 0 0 11 0 0 0 21 0
0 0 10 0 0 0 0 0 0 0 17 1 0 14 0 18 0 5 15 8 9 0 0 0 0
16 12 0 0 11 0 0 14 0 0 0 0 3 2 24 0 21 0 4 20 0 15 0 0 8
0 0 0 0 0 10 0 0 0 0 0 12 0 0 11 9 3 0 0 0 17 0 14 19 13
24 0 0 0 15 21 18 0 4 0 0 0 16 20 0 3 2 13 0 7 19 22 0 0 1
0 18 0 0 0 0 10 0 0 16 0 22 0 0 0 23 0 24 9 0 3 17 0 0 0
0 0 0 0 0 0 0 0 15 5 0 0 6 8 4 0 14 11 22 1 25 0 20 16 12
0 22 19 14 1 3 0 0 0 0 23 9 5 0 15 25 0 20 10 12 21 0 8 0 4
20 0 25 16 0 0 22 11 1 0 3 0 2 13 7 21 0 0 0 0 0 9 0 0 0

View file

@ -0,0 +1,10 @@
5 3 0 0 7 0 0 0 0
6 0 0 1 9 5 3 0 0
0 9 8 0 0 0 0 6 0
8 0 0 0 6 0 0 0 3
4 0 0 8 0 3 0 0 1
7 0 0 0 2 0 0 0 6
0 6 0 0 0 0 2 8 0
0 0 0 4 1 9 0 0 5
0 0 0 0 8 0 0 7 9

View file

@ -0,0 +1,23 @@
BIN_DIR=../bin
PROGRAM=$(BIN_DIR)/cholesky-tsk
MKL_HOME=$(MKLROOT)
CC=gcc
CFLAGS=-O3 -std=gnu99 -I$(MKL_HOME)/include -fopenmp
LDFLAGS=-L$(MKL_HOME)/lib/intel64 -lmkl_sequential -lmkl_core -lmkl_rt -lpthread -lm -fopenmp
VERBOSE=-DVERBOSE
all: $(PROGRAM)
$(PROGRAM): cholesky.c
$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
$(BIN_DIR):
mkdir $@
clean:
rm -rf $(PROGRAM) *.o
wipe: clean
rm -rf *.out *.err

View file

@ -0,0 +1,296 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <assert.h>
#include <sys/time.h>
#include <sys/times.h>
#include <math.h>
#include <mkl.h>
#include "omp.h"
#if !defined(_OPENMP)
int omp_get_max_threads() { return 1; }
int omp_get_num_threads() { return 1; }
#endif
void cholesky(int ts, int nt, double* Ah[nt][nt])
{
#ifdef VERBOSE
printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
#endif
#pragma omp parallel
#pragma omp single
for (int k = 0; k < nt; k++) {
// Diagonal Block factorization: using LAPACK
LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
// Triangular systems
for (int i = k + 1; i < nt; i++) {
#pragma omp task
cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
}
#pragma omp taskwait
// Update trailing matrix
for (int i = k + 1; i < nt; i++) {
for (int j = k + 1; j < i; j++) {
#pragma omp task
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
}
#pragma omp task
cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
}
#pragma omp taskwait
}
#ifdef VERBOSE
printf("> ...end of Cholesky Factorization.\n");
#endif
}
float get_time()
{
static double gtod_ref_time_sec = 0.0;
struct timeval tv;
gettimeofday(&tv, NULL);
// If this is the first invocation of through dclock(), then initialize the
// "reference time" global variable to the seconds field of the tv struct.
if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
// Normalize the seconds field of the tv struct so that it is relative to the
// "reference time" that was recorded during the first invocation of dclock().
const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
// Compute the number of seconds since the reference time.
const double t = norm_sec + tv.tv_usec * 1.0e-6;
return (float) t;
}
// Robust Check the factorization of the matrix A2
// Using directly Fortran services: dlacpy_, dtrmm, dlange_
static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
{
#ifdef VERBOSE
printf("> Checking the Cholesky Factorization... \n");
#endif
char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
double *Residual = (double *) malloc(N*N*sizeof(double));
double *L1 = (double *) malloc(N*N*sizeof(double));
double *L2 = (double *) malloc(N*N*sizeof(double));
double *work = (double *) malloc(N*sizeof(double));
memset((void*)L1, 0, N*N*sizeof(double));
memset((void*)L2, 0, N*N*sizeof(double));
double alpha= 1.0;
dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
/* Dealing with L'L or U'U */
if (uplo == 'U'){
dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
} else{
dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
}
/* Compute the Residual || A -L'L|| */
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
#ifdef VERBOSE
printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
#endif
const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
#ifdef VERBOSE
if ( info_factorization) printf("> - Factorization is suspicious!\n");
else printf("> - Factorization is CORRECT!\n");
#endif
free(Residual); free(L1); free(L2); free(work);
return info_factorization;
}
void initialize_matrix(const int n, const int ts, double *matrix)
{
#ifdef VERBOSE
printf("> Initializing matrix with random values...\n");
#endif
int ISEED[4] = {0,0,0,1};
int intONE=1;
for (int i = 0; i < n*n; i+=n) {
dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
}
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
matrix[i*n+j] = matrix[j*n+i];
}
}
// Diagonal values
for (int i = 0; i < n; i++) {
matrix[i*n+i] += (double) n;
}
}
void gather_block(const int N, const int ts, double *Alin, double *A)
{
for (int i = 0; i < ts; i++)
for (int j = 0; j < ts; j++) {
A[i*ts + j] = Alin[i*N + j];
}
}
void scatter_block(const int N, const int ts, double *A, double *Alin)
{
for (int i = 0; i < ts; i++)
for (int j = 0; j < ts; j++) {
Alin[i*N + j] = A[i*ts + j];
}
}
void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
{
#ifdef VERBOSE
printf("> Converting linear matrix to blocks...\n");
#endif
for (int i = 0; i < DIM; i++)
for (int j = 0; j < DIM; j++) {
gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
}
}
void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
{
#ifdef VERBOSE
printf("> Converting blocked matrix to linear...\n");
#endif
for (int i = 0; i < DIM; i++)
for (int j = 0; j < DIM; j++) {
scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
}
}
int main(int argc, char* argv[])
{
char *result[3] = {"n/a","pass","FAIL"};
const double eps = pow(2.0, -53);
// If number of arguments is not 4, print help
if ( argc != 4) {
printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
exit( -1 );
}
const int n = atoi(argv[1]); // matrix size
const int ts = atoi(argv[2]); // tile size
int check = atoi(argv[3]); // check result?
// Compute number of tiles
const int nt = n / ts;
assert((nt*ts) == n); // tile size should divide size
// Allocate matrix
double * const matrix = (double *) malloc(n * n * sizeof(double));
assert(matrix != NULL);
// Initialize matrix
initialize_matrix(n, ts, matrix);
// Allocate original matrix, and duplicate it, for debugging purposes
double * const original_matrix = (double *) malloc(n * n * sizeof(double));
assert(original_matrix != NULL);
// Save a copy of the original matrix
for (int i = 0; i < n * n; i++ ) {
original_matrix[i] = matrix[i];
}
// Set version description: Indirect blocked matrix
const char *version = "I-Blocked matrix";
// Allocate blocked matrix
double *Ah[nt][nt];
for (int i = 0; i < nt; i++) {
for (int j = 0; j < nt; j++) {
Ah[i][j] = malloc(ts * ts * sizeof(double));
assert(Ah[i][j] != NULL);
}
}
// ---------------------------------------
// Convert, compute (time), and re-convert
// ---------------------------------------
convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
const float tref = get_time();
cholesky(ts, nt, (double* (*)[nt]) Ah);
const float time = get_time() - tref;
convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
// Free blocked matrix
for (int i = 0; i < nt; i++) {
for (int j = 0; j < nt; j++) {
assert(Ah[i][j] != NULL);
free(Ah[i][j]);
}
}
// Check result, if requested
if ( check ) {
const char uplo = 'L';
if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
}
// Free original matrix, not needed anymore
free(original_matrix);
// Compute GFLOPs (Not verified)
float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
// Print results
#ifdef VERBOSE
printf( "\n" );
printf( "============ CHOLESKY RESULTS ============\n" );
printf( " test %s\n", argv[0]);
printf( " version %s\n", version);
printf( " matrix size: %dx%d\n", n, n);
printf( " tile size: %dx%d\n", ts, ts);
printf( " number of threads: %d\n", omp_get_max_threads());
printf( " time (s): %f\n", time);
printf( " performance (gflops): %f\n", gflops);
printf( " check: %s\n", result[check]);
printf( "==========================================\n" );
#else
printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
#endif
// Free matrix
free(matrix);
return 0;
}

View file

@ -0,0 +1,292 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <assert.h>
#include <sys/time.h>
#include <sys/times.h>
#include <math.h>
#include <mkl.h>
#include "omp.h"
#if !defined(_OPENMP)
int omp_get_max_threads() { return 1; }
int omp_get_num_threads() { return 1; }
#endif
void cholesky(int ts, int nt, double* Ah[nt][nt])
{
#ifdef VERBOSE
printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
#endif
// TODO: Parallel region is required in order to run in parallel.
// TODO: Kernel invocations could be task candidates.
// TODO: Add scheduler restrictions: taskwait, taskgroup, etc.
for (int k = 0; k < nt; k++) {
// Diagonal Block factorization: using LAPACK
LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
// Triangular systems
for (int i = k + 1; i < nt; i++) {
cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
}
// Update trailing matrix
for (int i = k + 1; i < nt; i++) {
for (int j = k + 1; j < i; j++) {
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
}
cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
}
}
#ifdef VERBOSE
printf("> ...end of Cholesky Factorization.\n");
#endif
}
float get_time()
{
static double gtod_ref_time_sec = 0.0;
struct timeval tv;
gettimeofday(&tv, NULL);
// If this is the first invocation of through dclock(), then initialize the
// "reference time" global variable to the seconds field of the tv struct.
if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
// Normalize the seconds field of the tv struct so that it is relative to the
// "reference time" that was recorded during the first invocation of dclock().
const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
// Compute the number of seconds since the reference time.
const double t = norm_sec + tv.tv_usec * 1.0e-6;
return (float) t;
}
// Robust Check the factorization of the matrix A2
// Using directly Fortran services: dlacpy_, dtrmm, dlange_
static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
{
#ifdef VERBOSE
printf("> Checking the Cholesky Factorization... \n");
#endif
char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
double *Residual = (double *) malloc(N*N*sizeof(double));
double *L1 = (double *) malloc(N*N*sizeof(double));
double *L2 = (double *) malloc(N*N*sizeof(double));
double *work = (double *) malloc(N*sizeof(double));
memset((void*)L1, 0, N*N*sizeof(double));
memset((void*)L2, 0, N*N*sizeof(double));
double alpha= 1.0;
dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
/* Dealing with L'L or U'U */
if (uplo == 'U'){
dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
} else{
dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
}
/* Compute the Residual || A -L'L|| */
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
#ifdef VERBOSE
printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
#endif
const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
#ifdef VERBOSE
if ( info_factorization) printf("> - Factorization is suspicious!\n");
else printf("> - Factorization is CORRECT!\n");
#endif
free(Residual); free(L1); free(L2); free(work);
return info_factorization;
}
void initialize_matrix(const int n, const int ts, double *matrix)
{
#ifdef VERBOSE
printf("> Initializing matrix with random values...\n");
#endif
int ISEED[4] = {0,0,0,1};
int intONE=1;
for (int i = 0; i < n*n; i+=n) {
dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
}
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
matrix[i*n+j] = matrix[j*n+i];
}
}
// Diagonal values
for (int i = 0; i < n; i++) {
matrix[i*n+i] += (double) n;
}
}
void gather_block(const int N, const int ts, double *Alin, double *A)
{
for (int i = 0; i < ts; i++)
for (int j = 0; j < ts; j++) {
A[i*ts + j] = Alin[i*N + j];
}
}
void scatter_block(const int N, const int ts, double *A, double *Alin)
{
for (int i = 0; i < ts; i++)
for (int j = 0; j < ts; j++) {
Alin[i*N + j] = A[i*ts + j];
}
}
void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
{
#ifdef VERBOSE
printf("> Converting linear matrix to blocks...\n");
#endif
for (int i = 0; i < DIM; i++)
for (int j = 0; j < DIM; j++) {
gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
}
}
void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
{
#ifdef VERBOSE
printf("> Converting blocked matrix to linear...\n");
#endif
for (int i = 0; i < DIM; i++)
for (int j = 0; j < DIM; j++) {
scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
}
}
int main(int argc, char* argv[])
{
char *result[3] = {"n/a","pass","FAIL"};
const double eps = pow(2.0, -53);
// If number of arguments is not 4, print help
if ( argc != 4) {
printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
exit( -1 );
}
const int n = atoi(argv[1]); // matrix size
const int ts = atoi(argv[2]); // tile size
int check = atoi(argv[3]); // check result?
// Compute number of tiles
const int nt = n / ts;
assert((nt*ts) == n); // tile size should divide size
// Allocate matrix
double * const matrix = (double *) malloc(n * n * sizeof(double));
assert(matrix != NULL);
// Initialize matrix
initialize_matrix(n, ts, matrix);
// Allocate original matrix, and duplicate it, for debugging purposes
double * const original_matrix = (double *) malloc(n * n * sizeof(double));
assert(original_matrix != NULL);
// Save a copy of the original matrix
for (int i = 0; i < n * n; i++ ) {
original_matrix[i] = matrix[i];
}
// Set version description: Indirect blocked matrix
const char *version = "I-Blocked matrix";
// Allocate blocked matrix
double *Ah[nt][nt];
for (int i = 0; i < nt; i++) {
for (int j = 0; j < nt; j++) {
Ah[i][j] = malloc(ts * ts * sizeof(double));
assert(Ah[i][j] != NULL);
}
}
// ---------------------------------------
// Convert, compute (time), and re-convert
// ---------------------------------------
convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
const float tref = get_time();
cholesky(ts, nt, (double* (*)[nt]) Ah);
const float time = get_time() - tref;
convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
// Free blocked matrix
for (int i = 0; i < nt; i++) {
for (int j = 0; j < nt; j++) {
assert(Ah[i][j] != NULL);
free(Ah[i][j]);
}
}
// Check result, if requested
if ( check ) {
const char uplo = 'L';
if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
}
// Free original matrix, not needed anymore
free(original_matrix);
// Compute GFLOPs (Not verified)
float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
// Print results
#ifdef VERBOSE
printf( "\n" );
printf( "============ CHOLESKY RESULTS ============\n" );
printf( " test %s\n", argv[0]);
printf( " version %s\n", version);
printf( " matrix size: %dx%d\n", n, n);
printf( " tile size: %dx%d\n", ts, ts);
printf( " number of threads: %d\n", omp_get_max_threads());
printf( " time (s): %f\n", time);
printf( " performance (gflops): %f\n", gflops);
printf( " check: %s\n", result[check]);
printf( "==========================================\n" );
#else
printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
#endif
// Free matrix
free(matrix);
return 0;
}

View file

@ -0,0 +1,25 @@
BIN_DIR=../bin
PROGRAM=$(BIN_DIR)/sudoku-tsk
CC=g++
CFLAGS=-c -O3 -fopenmp -DUSE_SEQUENTIAL_CUTOFF
LDFLAGS=-fopenmp
OBJS=SudokuBoard.o sudoku.o
all: $(PROGRAM)
$(PROGRAM): $(OBJS)
$(CC) -o $@ $^ $(LDFLAGS)
%.o: %.cpp
$(CC) $(CFLAGS) -o $@ $^
$(BIN_DIR):
mkdir $@
clean:
rm -rf $(PROGRAM) *.o
wipe: clean
rm -rf *.out *.err

View file

@ -0,0 +1,119 @@
/*
* This file is part of Christian's OpenMP Task-parallel Sudoku Solver
*
* Copyright (C) 2013 by Christian Terboven <christian@terboven.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "SudokuBoard.h"
#include <cstring>
CSudokuBoard::CSudokuBoard(int fsize, int bsize)
: field_size(fsize), block_size(bsize), solutions(-1)
{
field = new int[field_size*field_size];
}
CSudokuBoard::CSudokuBoard(const CSudokuBoard& other)
: field_size(other.getFieldSize()), block_size(other.getBlockSize()), solutions(other.getNumSolutions())
{
field = new int[field_size*field_size];
std::memcpy(field, other.field, sizeof(int) * field_size*field_size);
}
CSudokuBoard::~CSudokuBoard(void)
{
delete[] field;
}
bool CSudokuBoard::loadFromFile(char *filename)
{
std::ifstream ifile(filename);
if (!ifile) {
std::cout << "There was an error opening the input file " << filename << std::endl;
std::cout << std::endl;
return false;
}
for (int i = 0; i < this->field_size; i++) {
for (int j = 0; j < this->field_size; j++) {
ifile >> this->field[ACCESS(i,j)];
}
}
return true;
}
void CSudokuBoard::printBoard()
{
for(int i = 0; i < field_size; i++) {
for(int j = 0; j < field_size; j++) {
std::cout << std::setw(3) <<
this->field[ACCESS(i,j)]
<< " ";
}
std::cout << std::endl;
}
}
bool CSudokuBoard::check(int x, int y, int value)
{
if(checkHorizontal(y, value))
return true;
if(checkVertical(x, value))
return true;
if(checkBox(x, y, value))
return true;
return false;
}
bool CSudokuBoard::checkHorizontal(int y, int value) {
for(int i = 0; i < field_size; i++)
if(field[ACCESS(y,i)] == value)
return true;
return false;
}
bool CSudokuBoard::checkVertical(int x, int value) {
for(int i = 0; i < field_size; i++)
if(field[ACCESS(i,x)] == value)
return true;
return false;
}
bool CSudokuBoard::checkBox(int x, int y, int value) {
// find suitable box edge
int x_box = (int)(x / block_size) * block_size;
int y_box = (int)(y / block_size) * block_size;
for(int i = y_box; i < y_box + block_size; i++)
for(int j = x_box; j < x_box + block_size; j++)
if(field[ACCESS(i,j)] == value)
return true;
return false;
}

View file

@ -0,0 +1,96 @@
/*
* This file is part of Christian's OpenMP Task-parallel Sudoku Solver
*
* Copyright (C) 2013 by Christian Terboven <christian@terboven.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include <iostream>
#include <iomanip>
#include <fstream>
#include <vector>
#define ACCESS(x, y) (this->field_size*(x) + (y))
/**
* @brief Representation of the Sudoku board, includes solver and utility functions
*/
class CSudokuBoard
{
public:
CSudokuBoard(int fsize, int bsize);
CSudokuBoard(const CSudokuBoard& other);
~CSudokuBoard(void);
inline int getNumSolutions() const {
return this->solutions;
}
inline void incrementSolutionCounter() {
if (this->solutions == -1)
this->solutions = 1;
else
this->solutions++;
}
inline int getFieldSize() const {
return this->field_size;
}
inline int getBlockSize() const {
return this->block_size;
}
inline int get(int x, int y) const {
return this->field[ACCESS(x,y)];
}
inline void set(int x, int y, int value) {
this->field[ACCESS(x, y)] = value;
}
/**
* Read Sudoku template from file
* @param filename name of file to read input from
* @return true if reading file was successful, false if not
*/
bool loadFromFile(char *filename);
/**
* Print the Sudoku board to stdout
*/
void printBoard();
/**
* Check whether value already used
* @return true if already used, false if not
*/
bool check(int x, int y, int value);
private:
bool checkHorizontal(int y, int value);
bool checkVertical(int x, int value);
bool checkBox(int x, int y, int value);
int field_size;
int block_size;
int *field;
int solutions;
};

View file

@ -0,0 +1,177 @@
/*
* This file is part of Christian's OpenMP Task-parallel Sudoku Solver
*
* Copyright (C) 2013 by Christian Terboven <christian@terboven.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include "SudokuBoard.h"
int found_sudokus = 0;
/**
* Solve the Sudoku puzzle by finding and counting all solutions (if all are requested)
*/
bool solve(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true, bool findAllSolutions = true)
{
if(x == sudoku->getFieldSize()) { // end of line
y++;
x = 0;
if(y == sudoku->getFieldSize()) // end
return true;
}
if(sudoku->get(y, x) > 0) // field already set
return solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions); // tackle next field
for(int i = 1; i <= sudoku->getFieldSize(); i++) { // try all numbers
if(!sudoku->check(x, y, i)) {
sudoku->set(y, x, i); // if number fits, set it
if(solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions)) { // tackle next field
#pragma omp atomic
found_sudokus++;
sudoku->incrementSolutionCounter(); // solution found :-)
if (printAnyFoundSolution) {
std::cout << "The following is a valid solution:" << std::endl;
sudoku->printBoard();
std::cout << std::endl;
}
if (!findAllSolutions) {
return true; // return, as only one solution was asked for
}
}
}
}
sudoku->set(y, x, 0); // no solution found, reset field
return false;
}
/**
* Solve the Sudoku puzzle in parallel by finding and counting all solutions
*/
bool solve_parallel(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true)
{
if (x == sudoku->getFieldSize()) { // end of line
y++;
x = 0;
if(y == sudoku->getFieldSize()) // end
return true;
}
if (sudoku->get(y, x) > 0) { // field already set
return solve_parallel(x+1, y, sudoku, printAnyFoundSolution); // tackle next field
}
#if USE_SEQUENTIAL_CUTOFF
if ( y > 1 ) {
return solve(x, y, sudoku, printAnyFoundSolution);
}
#endif
for (int i = 1; i <= sudoku->getFieldSize(); i++) { // try all numbers
if (!sudoku->check(x, y, i)) {
#pragma omp task firstprivate(i,x,y,sudoku)
{
CSudokuBoard new_sudoku(*sudoku);
new_sudoku.set(y, x, i); // if number fits, set it
if (solve_parallel(x+1, y, &new_sudoku, printAnyFoundSolution)) { // tackle next field
#pragma omp atomic
found_sudokus++;
// sudoku->incrementSolutionCounter(); // solution found :-)
if (printAnyFoundSolution) {
std::cout << "The following is a valid solution:" << std::endl;
new_sudoku.printBoard();
std::cout << std::endl;
}
}
}
}
}
#pragma omp taskwait
sudoku->set(y, x, 0); // no solution found, reset field
return false;
}
/** @brief program entry point
*/
int main(int argc, char* argv[]) {
// measure the time
double t3, t4;
int nthreads = 0;
// expect three command line arguments: field size, block size, and input file
if (argc != 4) {
std::cout << "Usage: sudoku.exe <field size> <block size> <input filename>" << std::endl;
std::cout << std::endl;
return -1;
}
else {
CSudokuBoard *sudoku1 = new CSudokuBoard(atoi(argv[1]), atoi(argv[2]));
if (!sudoku1->loadFromFile(argv[3])) {
std::cout << "There was an error reading a Sudoku template from " << argv[3] << std::endl;
std::cout << std::endl;
return -1;
}
// print the Sudoku board template
std::cout << "Given Sudoku template" << std::endl;
sudoku1->printBoard();
// solve the Sudoku by finding (and printing) all solutions
t3 = omp_get_wtime();
#pragma omp parallel sections
{
solve_parallel(0, 0, sudoku1, false);
}
t4 = omp_get_wtime();
#pragma omp parallel
{
#pragma omp master
nthreads = omp_get_num_threads();
}
std::cout << std::endl;
std::cout << "In total there were " << sudoku1->getNumSolutions() << " solutions found in serial." << std::endl;
std::cout << "In total there were " << found_sudokus << " solutions found in parallel " << std::endl;
std::cout << std::endl;
delete sudoku1;
}
// print the time
std::cout << "Parallel computation took " << t4 - t3 << " seconds ("
<< nthreads << " threads)." << std::endl << std::endl;
return 0;
}

View file

@ -0,0 +1,169 @@
/*
* This file is part of Christian's OpenMP Task-parallel Sudoku Solver
*
* Copyright (C) 2013 by Christian Terboven <christian@terboven.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include "SudokuBoard.h"
int found_sudokus = 0;
/**
* Solve the Sudoku puzzle by finding and counting all solutions (if all are requested)
*/
bool solve(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true, bool findAllSolutions = true)
{
if(x == sudoku->getFieldSize()) { // end of line
y++;
x = 0;
if(y == sudoku->getFieldSize()) // end
return true;
}
if(sudoku->get(y, x) > 0) // field already set
return solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions); // tackle next field
for(int i = 1; i <= sudoku->getFieldSize(); i++) { // try all numbers
if(!sudoku->check(x, y, i)) {
sudoku->set(y, x, i); // if number fits, set it
if(solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions)) { // tackle next field
#pragma omp atomic
found_sudokus++;
sudoku->incrementSolutionCounter(); // solution found :-)
if (printAnyFoundSolution) {
std::cout << "The following is a valid solution:" << std::endl;
sudoku->printBoard();
std::cout << std::endl;
}
if (!findAllSolutions) {
return true; // return, as only one solution was asked for
}
}
}
}
sudoku->set(y, x, 0); // no solution found, reset field
return false;
}
/**
* Solve the Sudoku puzzle in parallel by finding and counting all solutions
*/
bool solve_parallel(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true)
{
if (x == sudoku->getFieldSize()) { // end of line
y++;
x = 0;
if(y == sudoku->getFieldSize()) // end
return true;
}
if (sudoku->get(y, x) > 0) { // field already set
return solve_parallel(x+1, y, sudoku, printAnyFoundSolution); // tackle next field
}
#if USE_SEQUENTIAL_CUTOFF
if ( y > 1 ) {
return solve(x, y, sudoku, printAnyFoundSolution);
}
#endif
// TODO: Task candidates (for each possible value?)
for (int i = 1; i <= sudoku->getFieldSize(); i++) { // try all numbers
if (!sudoku->check(x, y, i)) {
{
CSudokuBoard new_sudoku(*sudoku);
new_sudoku.set(y, x, i); // if number fits, set it
if (solve_parallel(x+1, y, &new_sudoku, printAnyFoundSolution)) { // tackle next field
#pragma omp atomic
found_sudokus++;
// sudoku->incrementSolutionCounter(); // solution found :-)
if (printAnyFoundSolution) {
std::cout << "The following is a valid solution:" << std::endl;
new_sudoku.printBoard();
std::cout << std::endl;
}
}
}
}
}
// TODO: After creating tasks for this level we will need to synchronize
sudoku->set(y, x, 0); // no solution found, reset field
return false;
}
/** @brief program entry point
*/
int main(int argc, char* argv[]) {
// measure the time
double t3, t4;
// expect three command line arguments: field size, block size, and input file
if (argc != 4) {
std::cout << "Usage: sudoku.exe <field size> <block size> <input filename>" << std::endl;
std::cout << std::endl;
return -1;
}
else {
CSudokuBoard *sudoku1 = new CSudokuBoard(atoi(argv[1]), atoi(argv[2]));
if (!sudoku1->loadFromFile(argv[3])) {
std::cout << "There was an error reading a Sudoku template from " << argv[3] << std::endl;
std::cout << std::endl;
return -1;
}
// print the Sudoku board template
std::cout << "Given Sudoku template" << std::endl;
sudoku1->printBoard();
// solve the Sudoku by finding (and printing) all solutions
t3 = omp_get_wtime();
// TODO: Sudoku solver needs to execute within a parallel region
{
solve_parallel(0, 0, sudoku1, false);
}
t4 = omp_get_wtime();
std::cout << std::endl;
std::cout << "In total there were " << found_sudokus << " solutions found in parallel " << std::endl;
std::cout << std::endl;
delete sudoku1;
}
// print the time
std::cout << "Parallel computation took " << t4 - t3 << " seconds ("
<< omp_get_max_threads() << " threads)." << std::endl << std::endl;
return 0;
}

14
02-dependencies/Makefile Normal file
View file

@ -0,0 +1,14 @@
ALL_RECURSIVE=all-recursive clean-recursive
DIRS=$(shell ls -d *-* | sed '/_/d' 2>/dev/null)
all: $(CONFIG) all-recursive
clean: $(CONFIG) clean-recursive
$(ALL_RECURSIVE):
@failcom='exit 1';\
target=`echo $@ | sed s/-recursive//`; \
for subdir in $(DIRS); do \
(cd $$subdir && $(MAKE) $$target) || eval $$failcom; \
done;

View file

View file

@ -0,0 +1,29 @@
#!/bin/bash
#SBATCH --job-name=cholesky
#SBATCH --workdir=.
#SBATCH --output=cholesky_%j.out
#SBATCH --error=cholesky_%j.err
#SBATCH --cpus-per-task=48
#SBATCH --ntasks=1
#SBATCH --time=00:30:00
#SBATCH --qos=debug
export IFS=";"
THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48"
#THREADS="01;02;04;06;08;10;12"
#MSIZES="2048;4096;8192"
MSIZES="8192"
MSIZES="16384"
#MSIZES="2048"
BSIZES="256"
for MS in $MSIZES; do
for BS in $BSIZES; do
for threads in $THREADS; do
OMP_NUM_THREADS=$threads ./cholesky-tsk-dep $MS $BS 0
done
done
done

View file

@ -0,0 +1,23 @@
BIN_DIR=../bin
PROGRAM=$(BIN_DIR)/cholesky-tsk-dep
MKL_HOME=$(MKLROOT)
CC=gcc
CFLAGS=-O3 -std=gnu99 -I$(MKL_HOME)/include -fopenmp
LDFLAGS=-L$(MKL_HOME)/lib/intel64 -lmkl_sequential -lmkl_core -lmkl_rt -lpthread -lm -fopenmp
#VERBOSE=-DVERBOSE
all: $(PROGRAM)
$(PROGRAM): cholesky.c
$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
$(BIN_DIR):
mkdir $@
clean:
rm -rf $(PROGRAM) *.o
wipe: clean
rm -rf *.out *.err

View file

@ -0,0 +1,296 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <assert.h>
#include <sys/time.h>
#include <sys/times.h>
#include <math.h>
#include <mkl.h>
#include "omp.h"
#if !defined(_OPENMP)
int omp_get_max_threads() { return 1; }
int omp_get_num_threads() { return 1; }
#endif
void cholesky(int ts, int nt, double* Ah[nt][nt])
{
#ifdef VERBOSE
printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
#endif
#pragma omp parallel
#pragma omp single
for (int k = 0; k < nt; k++) {
// Diagonal Block factorization: using LAPACK
#pragma omp task depend(inout: Ah[k][k])
LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
// Triangular systems
for (int i = k + 1; i < nt; i++) {
#pragma omp task depend(in: Ah[k][k]) depend(inout: Ah[k][i])
cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
}
// Update trailing matrix
for (int i = k + 1; i < nt; i++) {
for (int j = k + 1; j < i; j++) {
#pragma omp task depend(in: Ah[k][i], Ah[k][j]) depend(inout: Ah[j][i])
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
}
#pragma omp task depend(in: Ah[k][i]) depend(inout: Ah[i][i])
cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
}
}
#pragma omp taskwait
#ifdef VERBOSE
printf("> ...end of Cholesky Factorization.\n");
#endif
}
float get_time()
{
static double gtod_ref_time_sec = 0.0;
struct timeval tv;
gettimeofday(&tv, NULL);
// If this is the first invocation of through dclock(), then initialize the
// "reference time" global variable to the seconds field of the tv struct.
if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
// Normalize the seconds field of the tv struct so that it is relative to the
// "reference time" that was recorded during the first invocation of dclock().
const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
// Compute the number of seconds since the reference time.
const double t = norm_sec + tv.tv_usec * 1.0e-6;
return (float) t;
}
// Robust Check the factorization of the matrix A2
// Using directly Fortran services: dlacpy_, dtrmm, dlange_
static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
{
#ifdef VERBOSE
printf("> Checking the Cholesky Factorization... \n");
#endif
char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
double *Residual = (double *) malloc(N*N*sizeof(double));
double *L1 = (double *) malloc(N*N*sizeof(double));
double *L2 = (double *) malloc(N*N*sizeof(double));
double *work = (double *) malloc(N*sizeof(double));
memset((void*)L1, 0, N*N*sizeof(double));
memset((void*)L2, 0, N*N*sizeof(double));
double alpha= 1.0;
dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
/* Dealing with L'L or U'U */
if (uplo == 'U'){
dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
} else{
dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
}
/* Compute the Residual || A -L'L|| */
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
#ifdef VERBOSE
printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
#endif
const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
#ifdef VERBOSE
if ( info_factorization) printf("> - Factorization is suspicious!\n");
else printf("> - Factorization is CORRECT!\n");
#endif
free(Residual); free(L1); free(L2); free(work);
return info_factorization;
}
void initialize_matrix(const int n, const int ts, double *matrix)
{
#ifdef VERBOSE
printf("> Initializing matrix with random values...\n");
#endif
int ISEED[4] = {0,0,0,1};
int intONE=1;
for (int i = 0; i < n*n; i+=n) {
dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
}
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
matrix[i*n+j] = matrix[j*n+i];
}
}
// Diagonal values
for (int i = 0; i < n; i++) {
matrix[i*n+i] += (double) n;
}
}
void gather_block(const int N, const int ts, double *Alin, double *A)
{
for (int i = 0; i < ts; i++)
for (int j = 0; j < ts; j++) {
A[i*ts + j] = Alin[i*N + j];
}
}
void scatter_block(const int N, const int ts, double *A, double *Alin)
{
for (int i = 0; i < ts; i++)
for (int j = 0; j < ts; j++) {
Alin[i*N + j] = A[i*ts + j];
}
}
void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
{
#ifdef VERBOSE
printf("> Converting linear matrix to blocks...\n");
#endif
for (int i = 0; i < DIM; i++)
for (int j = 0; j < DIM; j++) {
gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
}
}
void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
{
#ifdef VERBOSE
printf("> Converting blocked matrix to linear...\n");
#endif
for (int i = 0; i < DIM; i++)
for (int j = 0; j < DIM; j++) {
scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
}
}
int main(int argc, char* argv[])
{
char *result[3] = {"n/a","pass","FAIL"};
const double eps = pow(2.0, -53);
// If number of arguments is not 4, print help
if ( argc != 4) {
printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
exit( -1 );
}
const int n = atoi(argv[1]); // matrix size
const int ts = atoi(argv[2]); // tile size
int check = atoi(argv[3]); // check result?
// Compute number of tiles
const int nt = n / ts;
assert((nt*ts) == n); // tile size should divide size
// Allocate matrix
double * const matrix = (double *) malloc(n * n * sizeof(double));
assert(matrix != NULL);
// Initialize matrix
initialize_matrix(n, ts, matrix);
// Allocate original matrix, and duplicate it, for debugging purposes
double * const original_matrix = (double *) malloc(n * n * sizeof(double));
assert(original_matrix != NULL);
// Save a copy of the original matrix
for (int i = 0; i < n * n; i++ ) {
original_matrix[i] = matrix[i];
}
// Set version description: Indirect blocked matrix
const char *version = "I-Blocked matrix";
// Allocate blocked matrix
double *Ah[nt][nt];
for (int i = 0; i < nt; i++) {
for (int j = 0; j < nt; j++) {
Ah[i][j] = malloc(ts * ts * sizeof(double));
assert(Ah[i][j] != NULL);
}
}
// ---------------------------------------
// Convert, compute (time), and re-convert
// ---------------------------------------
convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
const float tref = get_time();
cholesky(ts, nt, (double* (*)[nt]) Ah);
const float time = get_time() - tref;
convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
// Free blocked matrix
for (int i = 0; i < nt; i++) {
for (int j = 0; j < nt; j++) {
assert(Ah[i][j] != NULL);
free(Ah[i][j]);
}
}
// Check result, if requested
if ( check ) {
const char uplo = 'L';
if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
}
// Free original matrix, not needed anymore
free(original_matrix);
// Compute GFLOPs (Not verified)
float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
// Print results
#ifdef VERBOSE
printf( "\n" );
printf( "============ CHOLESKY RESULTS ============\n" );
printf( " test %s\n", argv[0]);
printf( " version %s\n", version);
printf( " matrix size: %dx%d\n", n, n);
printf( " tile size: %dx%d\n", ts, ts);
printf( " number of threads: %d\n", omp_get_max_threads());
printf( " time (s): %f\n", time);
printf( " performance (gflops): %f\n", gflops);
printf( " check: %s\n", result[check]);
printf( "==========================================\n" );
#else
printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
#endif
// Free matrix
free(matrix);
return 0;
}

View file

@ -0,0 +1,293 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <assert.h>
#include <sys/time.h>
#include <sys/times.h>
#include <math.h>
#include <mkl.h>
#include "omp.h"
#if !defined(_OPENMP)
int omp_get_max_threads() { return 1; }
int omp_get_num_threads() { return 1; }
#endif
void cholesky(int ts, int nt, double* Ah[nt][nt])
{
#ifdef VERBOSE
printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
#endif
// TODO: Parallel region is required in order to run in parallel.
// TODO: Kernel invocations could be task candidates.
// TODO: Add scheduler restrictions between tasks using dependences.
for (int k = 0; k < nt; k++) {
// Diagonal Block factorization: using LAPACK
LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
// Triangular systems
for (int i = k + 1; i < nt; i++) {
cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
}
// Update trailing matrix
for (int i = k + 1; i < nt; i++) {
for (int j = k + 1; j < i; j++) {
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
}
cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
}
}
// TODO: we need to guarantee all the work has finished at this point
#ifdef VERBOSE
printf("> ...end of Cholesky Factorization.\n");
#endif
}
float get_time()
{
static double gtod_ref_time_sec = 0.0;
struct timeval tv;
gettimeofday(&tv, NULL);
// If this is the first invocation of through dclock(), then initialize the
// "reference time" global variable to the seconds field of the tv struct.
if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
// Normalize the seconds field of the tv struct so that it is relative to the
// "reference time" that was recorded during the first invocation of dclock().
const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
// Compute the number of seconds since the reference time.
const double t = norm_sec + tv.tv_usec * 1.0e-6;
return (float) t;
}
// Robust Check the factorization of the matrix A2
// Using directly Fortran services: dlacpy_, dtrmm, dlange_
static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
{
#ifdef VERBOSE
printf("> Checking the Cholesky Factorization... \n");
#endif
char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
double *Residual = (double *) malloc(N*N*sizeof(double));
double *L1 = (double *) malloc(N*N*sizeof(double));
double *L2 = (double *) malloc(N*N*sizeof(double));
double *work = (double *) malloc(N*sizeof(double));
memset((void*)L1, 0, N*N*sizeof(double));
memset((void*)L2, 0, N*N*sizeof(double));
double alpha= 1.0;
dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
/* Dealing with L'L or U'U */
if (uplo == 'U'){
dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
} else{
dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
}
/* Compute the Residual || A -L'L|| */
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
#ifdef VERBOSE
printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
#endif
const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
#ifdef VERBOSE
if ( info_factorization) printf("> - Factorization is suspicious!\n");
else printf("> - Factorization is CORRECT!\n");
#endif
free(Residual); free(L1); free(L2); free(work);
return info_factorization;
}
void initialize_matrix(const int n, const int ts, double *matrix)
{
#ifdef VERBOSE
printf("> Initializing matrix with random values...\n");
#endif
int ISEED[4] = {0,0,0,1};
int intONE=1;
for (int i = 0; i < n*n; i+=n) {
dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
}
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
matrix[i*n+j] = matrix[j*n+i];
}
}
// Diagonal values
for (int i = 0; i < n; i++) {
matrix[i*n+i] += (double) n;
}
}
void gather_block(const int N, const int ts, double *Alin, double *A)
{
for (int i = 0; i < ts; i++)
for (int j = 0; j < ts; j++) {
A[i*ts + j] = Alin[i*N + j];
}
}
void scatter_block(const int N, const int ts, double *A, double *Alin)
{
for (int i = 0; i < ts; i++)
for (int j = 0; j < ts; j++) {
Alin[i*N + j] = A[i*ts + j];
}
}
void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
{
#ifdef VERBOSE
printf("> Converting linear matrix to blocks...\n");
#endif
for (int i = 0; i < DIM; i++)
for (int j = 0; j < DIM; j++) {
gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
}
}
void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
{
#ifdef VERBOSE
printf("> Converting blocked matrix to linear...\n");
#endif
for (int i = 0; i < DIM; i++)
for (int j = 0; j < DIM; j++) {
scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
}
}
int main(int argc, char* argv[])
{
char *result[3] = {"n/a","pass","FAIL"};
const double eps = pow(2.0, -53);
// If number of arguments is not 4, print help
if ( argc != 4) {
printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
exit( -1 );
}
const int n = atoi(argv[1]); // matrix size
const int ts = atoi(argv[2]); // tile size
int check = atoi(argv[3]); // check result?
// Compute number of tiles
const int nt = n / ts;
assert((nt*ts) == n); // tile size should divide size
// Allocate matrix
double * const matrix = (double *) malloc(n * n * sizeof(double));
assert(matrix != NULL);
// Initialize matrix
initialize_matrix(n, ts, matrix);
// Allocate original matrix, and duplicate it, for debugging purposes
double * const original_matrix = (double *) malloc(n * n * sizeof(double));
assert(original_matrix != NULL);
// Save a copy of the original matrix
for (int i = 0; i < n * n; i++ ) {
original_matrix[i] = matrix[i];
}
// Set version description: Indirect blocked matrix
const char *version = "I-Blocked matrix";
// Allocate blocked matrix
double *Ah[nt][nt];
for (int i = 0; i < nt; i++) {
for (int j = 0; j < nt; j++) {
Ah[i][j] = malloc(ts * ts * sizeof(double));
assert(Ah[i][j] != NULL);
}
}
// ---------------------------------------
// Convert, compute (time), and re-convert
// ---------------------------------------
convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
const float tref = get_time();
cholesky(ts, nt, (double* (*)[nt]) Ah);
const float time = get_time() - tref;
convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
// Free blocked matrix
for (int i = 0; i < nt; i++) {
for (int j = 0; j < nt; j++) {
assert(Ah[i][j] != NULL);
free(Ah[i][j]);
}
}
// Check result, if requested
if ( check ) {
const char uplo = 'L';
if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
}
// Free original matrix, not needed anymore
free(original_matrix);
// Compute GFLOPs (Not verified)
float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
// Print results
#ifdef VERBOSE
printf( "\n" );
printf( "============ CHOLESKY RESULTS ============\n" );
printf( " test %s\n", argv[0]);
printf( " version %s\n", version);
printf( " matrix size: %dx%d\n", n, n);
printf( " tile size: %dx%d\n", ts, ts);
printf( " number of threads: %d\n", omp_get_max_threads());
printf( " time (s): %f\n", time);
printf( " performance (gflops): %f\n", gflops);
printf( " check: %s\n", result[check]);
printf( "==========================================\n" );
#else
printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
#endif
// Free matrix
free(matrix);
return 0;
}

14
03-cut-offs/Makefile Normal file
View file

@ -0,0 +1,14 @@
ALL_RECURSIVE=all-recursive clean-recursive
DIRS=$(shell ls -d */ | sed '/_/d' 2>/dev/null | sed '/bin/d' 2>/dev/null)
all: all-recursive
clean: clean-recursive
$(ALL_RECURSIVE):
@failcom='exit 1';\
target=`echo $@ | sed s/-recursive//`; \
for subdir in $(DIRS); do \
(cd $$subdir && $(MAKE) $$target) || eval $$failcom; \
done;

0
03-cut-offs/bin/.gitdir Normal file
View file

View file

@ -0,0 +1,24 @@
#!/bin/bash
#SBATCH --job-name=mergesort
#SBATCH --workdir=.
#SBATCH --output=mergesort_%j.out
#SBATCH --error=mergesort_%j.err
#SBATCH --cpus-per-task=48
#SBATCH --ntasks=1
#SBATCH --time=00:30:00
#SBATCH --qos=debug
export IFS=";"
THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48"
#THREADS="01;02;04;06;08;10;12"
ASIZES="1000000"
for AS in $ASIZES; do
for threads in $THREADS; do
echo Running mergesort-co with $threads
OMP_NUM_THREADS=$threads ./mergesort-co $AS
echo --
done
done

20
03-cut-offs/bin/run-sudoku.sh Executable file
View file

@ -0,0 +1,20 @@
#!/bin/bash
#SBATCH --job-name=sudoku
#SBATCH --workdir=.
#SBATCH --output=sudoku_%j.out
#SBATCH --error=sudoku_%j.err
#SBATCH --cpus-per-task=48
#SBATCH --ntasks=1
#SBATCH --time=00:30:00
#SBATCH --qos=debug
export IFS=";"
THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48"
#THREADS="01;02;04;06;08;10;12"
for threads in $THREADS; do
#OMP_NUM_THREADS=$threads ./sudoku-co 9 3 sudoku-9x9-1.txt
OMP_NUM_THREADS=$threads ./sudoku-co 16 4 sudoku-16x16-1.txt
#OMP_NUM_THREADS=$threads ./sudoku-co 25 5 sudoku-25x25-1.txt
done

View file

@ -0,0 +1,16 @@
0 6 0 0 0 0 0 8 11 0 0 15 14 0 0 16
15 11 0 0 0 16 14 0 0 0 12 0 0 6 0 0
13 0 9 12 0 0 0 0 3 16 14 0 15 11 10 0
2 0 16 0 11 0 15 10 1 0 0 0 0 0 0 0
0 15 11 10 0 0 16 2 13 8 9 12 0 0 0 0
12 13 0 0 4 1 5 6 2 3 0 0 0 0 11 10
5 0 6 1 12 0 9 0 15 11 10 7 16 0 0 3
0 2 0 0 0 10 0 11 6 0 5 0 0 13 0 9
10 7 15 11 16 0 0 0 12 13 0 0 0 0 0 6
9 0 0 0 0 0 1 0 0 2 0 16 10 0 0 11
1 0 4 6 9 13 0 0 7 0 11 0 3 16 0 0
16 14 0 0 7 0 10 15 4 6 1 0 0 0 13 8
11 10 0 15 0 0 0 16 9 12 13 0 0 1 5 4
0 0 12 0 1 4 6 0 16 0 0 0 11 10 0 0
0 0 5 0 8 12 13 0 10 0 0 11 2 0 0 14
3 16 0 0 10 0 0 7 0 0 6 0 0 0 12 0

View file

@ -0,0 +1,25 @@
0 8 0 18 0 12 20 21 16 0 0 0 0 0 0 15 0 0 24 5 7 0 19 0 0
0 0 15 9 5 4 0 23 0 18 12 20 0 21 16 7 0 0 0 2 1 0 0 0 0
0 0 0 0 14 7 0 0 0 0 0 24 0 0 0 0 10 0 0 16 0 0 23 18 6
21 20 12 0 16 0 0 25 14 22 7 0 0 19 2 4 0 23 8 6 0 24 0 0 0
19 0 7 0 2 15 0 0 0 9 0 0 18 0 0 0 0 0 0 0 12 0 21 0 16
0 23 6 0 0 16 0 0 0 20 14 25 11 0 0 5 24 7 3 9 2 19 1 0 17
4 21 0 20 0 0 0 12 22 11 2 19 0 0 0 0 0 15 23 0 0 3 7 0 9
12 25 14 0 22 0 19 0 0 13 0 0 24 7 0 0 0 0 21 10 6 0 15 8 18
7 0 0 24 9 6 0 15 18 0 0 0 0 0 10 0 13 1 19 0 0 25 0 11 22
0 19 2 13 17 5 0 0 9 0 6 0 8 15 0 14 11 12 25 22 16 0 0 0 10
0 0 0 1 0 0 0 0 3 7 8 0 15 0 0 11 12 10 0 0 0 0 18 4 0
10 16 11 12 25 0 14 22 19 1 0 0 0 17 3 20 0 0 6 21 8 0 0 0 0
0 0 8 0 23 20 0 18 21 4 11 16 0 0 25 0 7 0 2 0 0 14 22 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 22 19 8 15 0 5 23 24 0 17 0 3
0 2 0 7 3 8 0 9 23 15 20 0 4 0 21 13 1 0 14 19 0 0 10 12 25
2 0 0 0 0 18 15 5 0 23 0 0 21 6 20 17 0 14 1 13 0 12 0 25 11
0 1 0 19 13 9 7 0 24 0 0 0 23 0 8 0 25 0 0 11 0 0 0 21 0
0 0 10 0 0 0 0 0 0 0 17 1 0 14 0 18 0 5 15 8 9 0 0 0 0
16 12 0 0 11 0 0 14 0 0 0 0 3 2 24 0 21 0 4 20 0 15 0 0 8
0 0 0 0 0 10 0 0 0 0 0 12 0 0 11 9 3 0 0 0 17 0 14 19 13
24 0 0 0 15 21 18 0 4 0 0 0 16 20 0 3 2 13 0 7 19 22 0 0 1
0 18 0 0 0 0 10 0 0 16 0 22 0 0 0 23 0 24 9 0 3 17 0 0 0
0 0 0 0 0 0 0 0 15 5 0 0 6 8 4 0 14 11 22 1 25 0 20 16 12
0 22 19 14 1 3 0 0 0 0 23 9 5 0 15 25 0 20 10 12 21 0 8 0 4
20 0 25 16 0 0 22 11 1 0 3 0 2 13 7 21 0 0 0 0 0 9 0 0 0

View file

@ -0,0 +1,10 @@
5 3 0 0 7 0 0 0 0
6 0 0 1 9 5 3 0 0
0 9 8 0 0 0 0 6 0
8 0 0 0 6 0 0 0 3
4 0 0 8 0 3 0 0 1
7 0 0 0 2 0 0 0 6
0 6 0 0 0 0 2 8 0
0 0 0 4 1 9 0 0 5
0 0 0 0 8 0 0 7 9

View file

@ -0,0 +1,20 @@
BIN_DIR=../bin
PROGRAM=$(BIN_DIR)/mergesort-co
CC=g++
CFLAGS=-O3 -fopenmp
LDFLAGS=-fopenmp
all: $(PROGRAM)
$(PROGRAM): mergesort.cpp
$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
$(BIN_DIR):
mkdir $@
clean:
rm -rf $(PROGRAM) *.o
wipe: clean
rm -rf *.out *.err

View file

@ -0,0 +1,272 @@
/*
* This file is part of Christian's OpenMP software lab
*
* Copyright (C) 2016 by Christian Terboven <terboven@itc.rwth-aachen.de>
* Copyright (C) 2016 by Jonas Hahnfeld <hahnfeld@itc.rwth-aachen.de>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/time.h>
#include <iostream>
#include <algorithm>
#include <cstdlib>
#include <cstdio>
#include <cmath>
#include <ctime>
#include <cstring>
#include <omp.h>
/**
* helper routine: check if array is sorted correctly
*/
bool isSorted(int ref[], int data[], const size_t size){
std::sort(ref, ref + size);
for (size_t idx = 0; idx < size; ++idx){
if (ref[idx] != data[idx]) {
return false;
}
}
return true;
}
/**
* sequential merge step (straight-forward implementation)
*/
void MsMergeSequential(int *out, int *in, long begin1, long end1, long begin2, long end2, long outBegin) {
long left = begin1;
long right = begin2;
long idx = outBegin;
while (left < end1 && right < end2) {
if (in[left] <= in[right]) {
out[idx] = in[left];
left++;
} else {
out[idx] = in[right];
right++;
}
idx++;
}
while (left < end1) {
out[idx] = in[left];
left++, idx++;
}
while (right < end2) {
out[idx] = in[right];
right++, idx++;
}
}
/**
* sequential MergeSort
*/
void MsSequential(int *array, int *tmp, bool inplace, long begin, long end) {
if (begin < (end - 1)) {
const long half = (begin + end) / 2;
MsSequential(array, tmp, !inplace, begin, half);
MsSequential(array, tmp, !inplace, half, end);
if (inplace) {
MsMergeSequential(array, tmp, begin, half, half, end, begin);
} else {
MsMergeSequential(tmp, array, begin, half, half, end, begin);
}
} else if (!inplace) {
tmp[begin] = array[begin];
}
}
/**
* parallel merge step (straight-forward implementation)
*/
void MsMergeParallel(int *out, int *in, long begin1, long end1, long begin2, long end2, long outBegin, int deep) {
if (deep) {
long half1, half2, tmp, count, step;
if ((end1 - begin1) < (end2 - begin2)) {
half2 = (begin2 + end2) / 2;
// find in[half2] in [begin1, end1) (std::upper_bound)
half1 = begin1, count = (end1 - begin1);
while (count > 0) {
step = count / 2;
tmp = half1 + step;
if (in[tmp] <= in[half2]) {
tmp++;
half1 = tmp;
count -= step + 1;
} else {
count = step;
}
}
} else {
half1 = (begin1 + end1) / 2;
// find in[half1] in [begin2, end2) (std::lower_bound)
half2 = begin2, count = (end2 - begin2);
while (count > 0) {
step = count / 2;
tmp = half2 + step;
if (in[tmp] < in[half1]) {
tmp++;
half2 = tmp;
count -= step + 1;
} else {
count = step;
}
}
}
#pragma omp task default(shared)
{
MsMergeParallel(out, in, begin1, half1, begin2, half2, outBegin, deep - 1);
}
long outBegin2 = outBegin + (half1 - begin1) + (half2 - begin2);
#pragma omp task default(shared)
{
MsMergeParallel(out, in, half1, end1, half2, end2, outBegin2, deep - 1);
}
#pragma omp taskwait
} else {
MsMergeSequential(out, in, begin1, end1, begin2, end2, outBegin);
}
}
/**
* OpenMP Task-parallel MergeSort
*/
void MsParallel(int *array, int *tmp, bool inplace, long begin, long end, int deep) {
if (begin < (end - 1)) {
long half = (begin + end) / 2;
if (deep){
#pragma omp task default(shared)
{
MsParallel(array, tmp, !inplace, begin, half, deep - 1);
}
#pragma omp task default(shared)
{
MsParallel(array, tmp, !inplace, half, end, deep - 1);
}
#pragma omp taskwait
}
else {
MsSequential(array, tmp, !inplace, begin, half);
MsSequential(array, tmp, !inplace, half, end);
}
if (inplace) {
MsMergeParallel(array, tmp, begin, half, half, end, begin, deep);
} else {
MsMergeParallel(tmp, array, begin, half, half, end, begin, deep);
}
} else if (!inplace) {
tmp[begin] = array[begin];
}
}
/**
* OpenMP Task-parallel MergeSort
* startup routine containing the Parallel Region
*/
void MsParallelOmp(int *array, int *tmp, const size_t size) {
// compute cut-off recursion level
const int iMinTask = (omp_get_max_threads() * 5);
int deep = 0;
while ((1 << deep) < iMinTask) deep += 1;
#pragma omp parallel
#pragma omp master
{
MsParallel(array, tmp, true, 0, size, deep);
}
}
/**
* @brief program entry point
*/
int main(int argc, char* argv[]) {
// variables to measure the elapsed time
struct timeval t1, t2;
double etime;
// expect one command line arguments: array size
if (argc != 2) {
printf("Usage: MergeSort.exe <array size> \n");
printf("\n");
return EXIT_FAILURE;
}
else {
const size_t stSize = strtol(argv[1], NULL, 10);
int *data = (int*) malloc(stSize * sizeof(int));
int *tmp = (int*) malloc(stSize * sizeof(int));
int *ref = (int*) malloc(stSize * sizeof(int));
// first touch
#pragma omp parallel for
for (size_t idx = 0; idx < stSize; ++idx){
data[idx] = 0;
tmp[idx] = 0;
}
printf("Initialization...\n");
srand(95);
for (size_t idx = 0; idx < stSize; ++idx){
data[idx] = (int) (stSize * (double(rand()) / RAND_MAX));
}
std::copy(data, data + stSize, ref);
double dSize = (stSize * sizeof(int)) / 1024 / 1024;
printf("Sorting %zu elements of type int (%f MiB)...\n", stSize, dSize);
gettimeofday(&t1, NULL);
MsParallelOmp(data, tmp, stSize);
gettimeofday(&t2, NULL);
etime = (t2.tv_sec - t1.tv_sec) * 1000 + (t2.tv_usec - t1.tv_usec) / 1000;
etime = etime / 1000;
printf("done, took %f sec. Verification...", etime);
if (isSorted(ref, data, stSize)) {
printf(" successful.\n");
}
else {
printf(" FAILED.\n");
}
free(data);
free(tmp);
free(ref);
}
return EXIT_SUCCESS;
}

View file

@ -0,0 +1,174 @@
/*
* This file is part of Christian's OpenMP software lab
*
* Copyright (C) 2016 by Christian Terboven <terboven@itc.rwth-aachen.de>
* Copyright (C) 2016 by Jonas Hahnfeld <hahnfeld@itc.rwth-aachen.de>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/time.h>
#include <iostream>
#include <algorithm>
#include <cstdlib>
#include <cstdio>
#include <cmath>
#include <ctime>
#include <cstring>
/**
* helper routine: check if array is sorted correctly
*/
bool isSorted(int ref[], int data[], const size_t size){
std::sort(ref, ref + size);
for (size_t idx = 0; idx < size; ++idx){
if (ref[idx] != data[idx]) {
return false;
}
}
return true;
}
/**
* sequential merge step (straight-forward implementation)
*/
// TODO: cut-off could also apply here (extra parameter?)
// TODO: optional: we can also break merge in two halves
void MsMergeSequential(int *out, int *in, long begin1, long end1, long begin2, long end2, long outBegin) {
long left = begin1;
long right = begin2;
long idx = outBegin;
while (left < end1 && right < end2) {
if (in[left] <= in[right]) {
out[idx] = in[left];
left++;
} else {
out[idx] = in[right];
right++;
}
idx++;
}
while (left < end1) {
out[idx] = in[left];
left++, idx++;
}
while (right < end2) {
out[idx] = in[right];
right++, idx++;
}
}
/**
* sequential MergeSort
*/
// TODO: remember one additional parameter (depth)
// TODO: recursive calls could be taskyfied
// TODO: task synchronization also is required
void MsSequential(int *array, int *tmp, bool inplace, long begin, long end) {
if (begin < (end - 1)) {
const long half = (begin + end) / 2;
MsSequential(array, tmp, !inplace, begin, half);
MsSequential(array, tmp, !inplace, half, end);
if (inplace) {
MsMergeSequential(array, tmp, begin, half, half, end, begin);
} else {
MsMergeSequential(tmp, array, begin, half, half, end, begin);
}
} else if (!inplace) {
tmp[begin] = array[begin];
}
}
/**
* Serial MergeSort
*/
// TODO: this function should create the parallel region
// TODO: good point to compute a good depth level (cut-off)
void MsSerial(int *array, int *tmp, const size_t size) {
// TODO: parallel version of MsSequential will receive one more parameter: 'depth' (used as cut-off)
MsSequential(array, tmp, true, 0, size);
}
/**
* @brief program entry point
*/
int main(int argc, char* argv[]) {
// variables to measure the elapsed time
struct timeval t1, t2;
double etime;
// expect one command line arguments: array size
if (argc != 2) {
printf("Usage: MergeSort.exe <array size> \n");
printf("\n");
return EXIT_FAILURE;
}
else {
const size_t stSize = strtol(argv[1], NULL, 10);
int *data = (int*) malloc(stSize * sizeof(int));
int *tmp = (int*) malloc(stSize * sizeof(int));
int *ref = (int*) malloc(stSize * sizeof(int));
printf("Initialization...\n");
srand(95);
for (size_t idx = 0; idx < stSize; ++idx){
data[idx] = (int) (stSize * (double(rand()) / RAND_MAX));
}
std::copy(data, data + stSize, ref);
double dSize = (stSize * sizeof(int)) / 1024 / 1024;
printf("Sorting %zu elements of type int (%f MiB)...\n", stSize, dSize);
gettimeofday(&t1, NULL);
MsSerial(data, tmp, stSize);
gettimeofday(&t2, NULL);
etime = (t2.tv_sec - t1.tv_sec) * 1000 + (t2.tv_usec - t1.tv_usec) / 1000;
etime = etime / 1000;
printf("done, took %f sec. Verification...", etime);
if (isSorted(ref, data, stSize)) {
printf(" successful.\n");
}
else {
printf(" FAILED.\n");
}
free(data);
free(tmp);
free(ref);
}
return EXIT_SUCCESS;
}

View file

@ -0,0 +1,29 @@
BIN_DIR=../bin
PROGRAM=$(BIN_DIR)/sudoku-co
CC=g++
CFLAGS=-c -O3 -fopenmp -DUSE_SEQUENTIAL_CUTOFF
LDFLAGS=-fopenmp
OBJS=SudokuBoard.o sudoku.o
all: $(PROGRAM)
$(PROGRAM): $(OBJS)
$(CC) -o $@ $^ $(LDFLAGS)
%.o: %.cpp
$(CC) $(CFLAGS) -o $@ $^
sudoku.cpp:
@cat README.md
@false
$(BIN_DIR):
mkdir $@
clean:
rm -rf $(PROGRAM) *.o
wipe: clean
rm -rf *.out *.err

View file

@ -0,0 +1,5 @@
====================================================================================================
Copy your previous version of 01-tasks/sudoku-tsk exercise (sudoku.c) and modify the code in order
to use a native OpenMP cut-off mechanism (ie, the 'if' or the 'final' clauses) instead of the
manual cut-off version implemented as: "USE_SEQUENTIAL_CUTOFF".
====================================================================================================

View file

@ -0,0 +1,119 @@
/*
* This file is part of Christian's OpenMP Task-parallel Sudoku Solver
*
* Copyright (C) 2013 by Christian Terboven <christian@terboven.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "SudokuBoard.h"
#include <cstring>
CSudokuBoard::CSudokuBoard(int fsize, int bsize)
: field_size(fsize), block_size(bsize), solutions(-1)
{
field = new int[field_size*field_size];
}
CSudokuBoard::CSudokuBoard(const CSudokuBoard& other)
: field_size(other.getFieldSize()), block_size(other.getBlockSize()), solutions(other.getNumSolutions())
{
field = new int[field_size*field_size];
std::memcpy(field, other.field, sizeof(int) * field_size*field_size);
}
CSudokuBoard::~CSudokuBoard(void)
{
delete[] field;
}
bool CSudokuBoard::loadFromFile(char *filename)
{
std::ifstream ifile(filename);
if (!ifile) {
std::cout << "There was an error opening the input file " << filename << std::endl;
std::cout << std::endl;
return false;
}
for (int i = 0; i < this->field_size; i++) {
for (int j = 0; j < this->field_size; j++) {
ifile >> this->field[ACCESS(i,j)];
}
}
return true;
}
void CSudokuBoard::printBoard()
{
for(int i = 0; i < field_size; i++) {
for(int j = 0; j < field_size; j++) {
std::cout << std::setw(3) <<
this->field[ACCESS(i,j)]
<< " ";
}
std::cout << std::endl;
}
}
bool CSudokuBoard::check(int x, int y, int value)
{
if(checkHorizontal(y, value))
return true;
if(checkVertical(x, value))
return true;
if(checkBox(x, y, value))
return true;
return false;
}
bool CSudokuBoard::checkHorizontal(int y, int value) {
for(int i = 0; i < field_size; i++)
if(field[ACCESS(y,i)] == value)
return true;
return false;
}
bool CSudokuBoard::checkVertical(int x, int value) {
for(int i = 0; i < field_size; i++)
if(field[ACCESS(i,x)] == value)
return true;
return false;
}
bool CSudokuBoard::checkBox(int x, int y, int value) {
// find suitable box edge
int x_box = (int)(x / block_size) * block_size;
int y_box = (int)(y / block_size) * block_size;
for(int i = y_box; i < y_box + block_size; i++)
for(int j = x_box; j < x_box + block_size; j++)
if(field[ACCESS(i,j)] == value)
return true;
return false;
}

View file

@ -0,0 +1,96 @@
/*
* This file is part of Christian's OpenMP Task-parallel Sudoku Solver
*
* Copyright (C) 2013 by Christian Terboven <christian@terboven.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include <iostream>
#include <iomanip>
#include <fstream>
#include <vector>
#define ACCESS(x, y) (this->field_size*(x) + (y))
/**
* @brief Representation of the Sudoku board, includes solver and utility functions
*/
class CSudokuBoard
{
public:
CSudokuBoard(int fsize, int bsize);
CSudokuBoard(const CSudokuBoard& other);
~CSudokuBoard(void);
inline int getNumSolutions() const {
return this->solutions;
}
inline void incrementSolutionCounter() {
if (this->solutions == -1)
this->solutions = 1;
else
this->solutions++;
}
inline int getFieldSize() const {
return this->field_size;
}
inline int getBlockSize() const {
return this->block_size;
}
inline int get(int x, int y) const {
return this->field[ACCESS(x,y)];
}
inline void set(int x, int y, int value) {
this->field[ACCESS(x, y)] = value;
}
/**
* Read Sudoku template from file
* @param filename name of file to read input from
* @return true if reading file was successful, false if not
*/
bool loadFromFile(char *filename);
/**
* Print the Sudoku board to stdout
*/
void printBoard();
/**
* Check whether value already used
* @return true if already used, false if not
*/
bool check(int x, int y, int value);
private:
bool checkHorizontal(int y, int value);
bool checkVertical(int x, int value);
bool checkBox(int x, int y, int value);
int field_size;
int block_size;
int *field;
int solutions;
};

View file

@ -0,0 +1,177 @@
/*
* This file is part of Christian's OpenMP Task-parallel Sudoku Solver
*
* Copyright (C) 2013 by Christian Terboven <christian@terboven.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include "SudokuBoard.h"
int found_sudokus = 0;
/**
* Solve the Sudoku puzzle by finding and counting all solutions (if all are requested)
*/
bool solve(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true, bool findAllSolutions = true)
{
if(x == sudoku->getFieldSize()) { // end of line
y++;
x = 0;
if(y == sudoku->getFieldSize()) // end
return true;
}
if(sudoku->get(y, x) > 0) // field already set
return solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions); // tackle next field
for(int i = 1; i <= sudoku->getFieldSize(); i++) { // try all numbers
if(!sudoku->check(x, y, i)) {
sudoku->set(y, x, i); // if number fits, set it
if(solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions)) { // tackle next field
#pragma omp atomic
found_sudokus++;
sudoku->incrementSolutionCounter(); // solution found :-)
if (printAnyFoundSolution) {
std::cout << "The following is a valid solution:" << std::endl;
sudoku->printBoard();
std::cout << std::endl;
}
if (!findAllSolutions) {
return true; // return, as only one solution was asked for
}
}
}
}
sudoku->set(y, x, 0); // no solution found, reset field
return false;
}
/**
* Solve the Sudoku puzzle in parallel by finding and counting all solutions
*/
bool solve_parallel(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true)
{
if (x == sudoku->getFieldSize()) { // end of line
y++;
x = 0;
if(y == sudoku->getFieldSize()) // end
return true;
}
if (sudoku->get(y, x) > 0) { // field already set
return solve_parallel(x+1, y, sudoku, printAnyFoundSolution); // tackle next field
}
//#if USE_SEQUENTIAL_CUTOFF
// if ( y > 1 ) {
// return solve(x, y, sudoku, printAnyFoundSolution);
// }
//#endif
for (int i = 1; i <= sudoku->getFieldSize(); i++) { // try all numbers
if (!sudoku->check(x, y, i)) {
#pragma omp task firstprivate(i,x,y,sudoku) final(y > 0)
{
CSudokuBoard new_sudoku(*sudoku);
new_sudoku.set(y, x, i); // if number fits, set it
if (solve_parallel(x+1, y, &new_sudoku, printAnyFoundSolution)) { // tackle next field
#pragma omp atomic
found_sudokus++;
// sudoku->incrementSolutionCounter(); // solution found :-)
if (printAnyFoundSolution) {
std::cout << "The following is a valid solution:" << std::endl;
new_sudoku.printBoard();
std::cout << std::endl;
}
}
}
}
}
#pragma omp taskwait
sudoku->set(y, x, 0); // no solution found, reset field
return false;
}
/** @brief program entry point
*/
int main(int argc, char* argv[]) {
// measure the time
double t3, t4;
int nthreads = 0;
// expect three command line arguments: field size, block size, and input file
if (argc != 4) {
std::cout << "Usage: sudoku.exe <field size> <block size> <input filename>" << std::endl;
std::cout << std::endl;
return -1;
}
else {
CSudokuBoard *sudoku1 = new CSudokuBoard(atoi(argv[1]), atoi(argv[2]));
if (!sudoku1->loadFromFile(argv[3])) {
std::cout << "There was an error reading a Sudoku template from " << argv[3] << std::endl;
std::cout << std::endl;
return -1;
}
// print the Sudoku board template
std::cout << "Given Sudoku template" << std::endl;
sudoku1->printBoard();
// solve the Sudoku by finding (and printing) all solutions
t3 = omp_get_wtime();
#pragma omp parallel sections
{
solve_parallel(0, 0, sudoku1, false);
}
t4 = omp_get_wtime();
#pragma omp parallel
{
#pragma omp master
nthreads = omp_get_num_threads();
}
std::cout << std::endl;
std::cout << "In total there were " << sudoku1->getNumSolutions() << " solutions found in serial." << std::endl;
std::cout << "In total there were " << found_sudokus << " solutions found in parallel " << std::endl;
std::cout << std::endl;
delete sudoku1;
}
// print the time
std::cout << "Parallel computation took " << t4 - t3 << " seconds ("
<< nthreads << " threads)." << std::endl << std::endl;
return 0;
}

23
04-matmul/Makefile Normal file
View file

@ -0,0 +1,23 @@
BIN_DIR=.
PROGRAM=$(BIN_DIR)/matmul
MKL_HOME=$(MKLROOT)
CC=gcc
CFLAGS=-O3 -std=gnu99 -fopenmp
LDFLAGS=-fopenmp
#VERBOSE=-DVERBOSE
all: $(PROGRAM)
$(PROGRAM): matmul.c
$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
$(BIN_DIR):
mkdir $@
clean:
rm -rf $(PROGRAM) *.o
wipe: clean
rm -rf *.out *.err

174
04-matmul/matmul.c Normal file
View file

@ -0,0 +1,174 @@
//===-- matmul.c - Different implementations of matrix multiplies -*- C -*-===//
//
// Part of the LOMP Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#define N 3072
#define DUMP_MATRIX 0
#define DUMP_TASKS 0
void matmul_seq(float * C, float * A, float * B, size_t n) {
for (size_t i = 0; i < n; ++i) {
for (size_t k = 0; k < n; ++k) {
for (size_t j = 0; j < n; ++j) {
C[i * n + j] += A[i * n + k] * B[k * n + j];
}
}
}
}
void matmul_par(float * C, float * A, float * B, size_t n) {
#pragma omp parallel for schedule(static,8) firstprivate(n)
for (size_t i = 0; i < n; ++i) {
for (size_t k = 0; k < n; ++k) {
for (size_t j = 0; j < n; ++j) {
C[i * n + j] += A[i * n + k] * B[k * n + j];
}
}
}
}
void matmul_tloop(float * C, float * A, float * B, size_t n) {
#pragma omp parallel firstprivate(n)
#pragma omp single nowait
#pragma omp taskloop
for (size_t i = 0; i < n; ++i) {
for (size_t k = 0; k < n; ++k) {
for (size_t j = 0; j < n; ++j) {
C[i * n + j] += A[i * n + k] * B[k * n + j];
}
}
}
}
void matmul_task(float * C, float * A, float * B, size_t n) {
const size_t bf = 256;
if (n % bf != 0) {
printf("Blocking factor does not divide matrix size!\n");
exit(EXIT_FAILURE);
}
#pragma omp parallel firstprivate(n, bf)
#pragma omp master
{
// work on the blocks of the matrix
for (size_t ib = 0; ib < n; ib += bf)
for (size_t kb = 0; kb < n; kb += bf)
for (size_t jb = 0; jb < n; jb += bf) {
#pragma omp task firstprivate(ib, kb, jb) firstprivate(n, bf) \
depend(inout:C[ib * n + jb:bf]) \
depend(in:A[ib * n + kb:bf]) \
depend(in:B[kb * n + jb:bf])
{
#if DUMP_TASKS
printf("task: C[%ld,%ld] += A[%ld,%ld] o B[%ld,%ld]\n", ib, jb, ib, kb, kb, jb);
#endif
// work on a single block``
for (size_t i = ib; i < (ib + bf); ++i)
for (size_t k = kb; k < (kb + bf); ++k)
for (size_t j = jb; j < (jb + bf); ++j)
C[i * n + j] += A[i * n + k] * B[k * n + j];
}
}
}
}
void init_mat(float * C, float * A, float * B, size_t n) {
for (size_t i = 0; i < n; ++i) {
for (size_t j = 0; j < n; ++j) {
C[i * n + j] = 0.0;
A[i * n + j] = 0.5;
B[i * n + j] = 0.25;
}
}
}
void dump_mat(float * mtx, size_t n) {
for (size_t i = 0; i < n; ++i) {
for (size_t j = 0; j < n; ++j) {
printf("%f ", mtx[i * n + j]);
}
printf("\n");
}
}
float sum_mat(float * mtx, size_t n) {
float sum = 0.0;
for (size_t i = 0; i < n; ++i) {
for (size_t j = 0; j < n; ++j) {
sum += mtx[i * n + j];
}
}
return sum;
}
int main(int argc, char * argv[]) {
double ts, te;
double t_seq;
float * C;
float * A;
float * B;
// If number of arguments is not 1, print help
if (argc != 2) {
printf("%s: matrix_size\n", argv[0]);
return EXIT_FAILURE;
}
const int n = atoi(argv[1]); // matrix size
C = (float *) malloc(sizeof(*C) * n * n);
A = (float *) malloc(sizeof(*A) * n * n);
B = (float *) malloc(sizeof(*B) * n * n);
init_mat(C, A, B, n);
ts = omp_get_wtime();
matmul_seq(C, A, B, n);
te = omp_get_wtime();
#if DUMP_MATRIX
dump_mat(C, n);
#endif
t_seq = te - ts;
printf("Sum of matrix (serial): %f, wall time %lf, speed-up %.2lf\n",
sum_mat(C, n), (te-ts), t_seq / (te-ts));
init_mat(C, A, B, n);
ts = omp_get_wtime();
matmul_par(C, A, B, n);
te = omp_get_wtime();
#if DUMP_MATRIX
dump_mat(C, n);
#endif
printf("Sum of matrix (parallel): %f, wall time %lf, speed-up %.2lf\n",
sum_mat(C, n), (te-ts), t_seq / (te-ts));
init_mat(C, A, B, n);
ts = omp_get_wtime();
matmul_tloop(C, A, B, n);
te = omp_get_wtime();
#if DUMP_MATRIX
dump_mat(C, n);
#endif
printf("Sum of matrix (taskloop): %f, wall time %lf, speed-up %.2lf\n",
sum_mat(C, n), (te-ts), t_seq / (te-ts));
init_mat(C, A, B, n);
ts = omp_get_wtime();
matmul_task(C, A, B, n);
te = omp_get_wtime();
#if DUMP_MATRIX
dump_mat(C, n);
#endif
printf("Sum of matrix (tasks): %f, wall time %lf, speed-up %.2lf\n",
sum_mat(C, n), (te-ts), t_seq / (te-ts));
return EXIT_SUCCESS;
}

165
04-matmul/solution/matmul.c Normal file
View file

@ -0,0 +1,165 @@
//===-- matmul.c - Different implementations of matrix multiplies -*- C -*-===//
//
// Part of the LOMP Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#define N 3072
#define DUMP_MATRIX 0
#define DUMP_TASKS 0
void matmul_seq(float * C, float * A, float * B, size_t n) {
for (size_t i = 0; i < n; ++i) {
for (size_t k = 0; k < n; ++k) {
for (size_t j = 0; j < n; ++j) {
C[i * n + j] += A[i * n + k] * B[k * n + j];
}
}
}
}
void matmul_par(float * C, float * A, float * B, size_t n) {
#pragma omp parallel for schedule(static,8) firstprivate(n)
for (size_t i = 0; i < n; ++i) {
for (size_t k = 0; k < n; ++k) {
for (size_t j = 0; j < n; ++j) {
C[i * n + j] += A[i * n + k] * B[k * n + j];
}
}
}
}
void matmul_tloop(float * C, float * A, float * B, size_t n) {
for (size_t i = 0; i < n; ++i) {
for (size_t k = 0; k < n; ++k) {
for (size_t j = 0; j < n; ++j) {
C[i * n + j] += A[i * n + k] * B[k * n + j];
}
}
}
}
void matmul_task(float * C, float * A, float * B, size_t n) {
const size_t bf = 256;
if (n % bf != 0) {
printf("Blocking factor does not divide matrix size!\n");
exit(EXIT_FAILURE);
}
{
// work on the blocks of the matrix
for (size_t ib = 0; ib < n; ib += bf)
for (size_t kb = 0; kb < n; kb += bf)
for (size_t jb = 0; jb < n; jb += bf) {
{
#if DUMP_TASKS
printf("task: C[%ld,%ld] += A[%ld,%ld] o B[%ld,%ld]\n", ib, jb, ib, kb, kb, jb);
#endif
// work on a single block``
for (size_t i = ib; i < (ib + bf); ++i)
for (size_t k = kb; k < (kb + bf); ++k)
for (size_t j = jb; j < (jb + bf); ++j)
C[i * n + j] += A[i * n + k] * B[k * n + j];
}
}
}
}
void init_mat(float * C, float * A, float * B, size_t n) {
for (size_t i = 0; i < n; ++i) {
for (size_t j = 0; j < n; ++j) {
C[i * n + j] = 0.0;
A[i * n + j] = 0.5;
B[i * n + j] = 0.25;
}
}
}
void dump_mat(float * mtx, size_t n) {
for (size_t i = 0; i < n; ++i) {
for (size_t j = 0; j < n; ++j) {
printf("%f ", mtx[i * n + j]);
}
printf("\n");
}
}
float sum_mat(float * mtx, size_t n) {
float sum = 0.0;
for (size_t i = 0; i < n; ++i) {
for (size_t j = 0; j < n; ++j) {
sum += mtx[i * n + j];
}
}
return sum;
}
int main(int argc, char * argv[]) {
double ts, te;
double t_seq;
float * C;
float * A;
float * B;
// If number of arguments is not 1, print help
if (argc != 2) {
printf("%s: matrix_size\n", argv[0]);
return EXIT_FAILURE;
}
const int n = atoi(argv[1]); // matrix size
C = (float *) malloc(sizeof(*C) * n * n);
A = (float *) malloc(sizeof(*A) * n * n);
B = (float *) malloc(sizeof(*B) * n * n);
init_mat(C, A, B, n);
ts = omp_get_wtime();
matmul_seq(C, A, B, n);
te = omp_get_wtime();
#if DUMP_MATRIX
dump_mat(C, n);
#endif
t_seq = te - ts;
printf("Sum of matrix (serial): %f, wall time %lf, speed-up %.2lf\n",
sum_mat(C, n), (te-ts), t_seq / (te-ts));
init_mat(C, A, B, n);
ts = omp_get_wtime();
matmul_par(C, A, B, n);
te = omp_get_wtime();
#if DUMP_MATRIX
dump_mat(C, n);
#endif
printf("Sum of matrix (parallel): %f, wall time %lf, speed-up %.2lf\n",
sum_mat(C, n), (te-ts), t_seq / (te-ts));
init_mat(C, A, B, n);
ts = omp_get_wtime();
matmul_tloop(C, A, B, n);
te = omp_get_wtime();
#if DUMP_MATRIX
dump_mat(C, n);
#endif
printf("Sum of matrix (taskloop): %f, wall time %lf, speed-up %.2lf\n",
sum_mat(C, n), (te-ts), t_seq / (te-ts));
init_mat(C, A, B, n);
ts = omp_get_wtime();
matmul_task(C, A, B, n);
te = omp_get_wtime();
#if DUMP_MATRIX
dump_mat(C, n);
#endif
printf("Sum of matrix (tasks): %f, wall time %lf, speed-up %.2lf\n",
sum_mat(C, n), (te-ts), t_seq / (te-ts));
return EXIT_SUCCESS;
}

20
07-fibonacci/Makefile Normal file
View file

@ -0,0 +1,20 @@
PROG=fibonacci
CC=gcc
CCFLAGS=-O3 -fopenmp
LDFLAGS=-fopenmp
debug:
$(CC) $(CCFLAGS) -g -O0 ${PROG}.c -o ${PROG}.exe
release:
$(CC) $(CCFLAGS) -O3 ${PROG}.c -o ${PROG}.exe
solution:
$(CC) $(CCFLAGS) -O3 ${PROG}-solved.c -o ${PROG}-solved.exe
run:
./${PROG}.exe < input
clean:
rm -f ${PROG}.exe ${PROG}-solved.exe ${PROG}.o *~

View file

@ -0,0 +1,79 @@
/*
* OpenMP lecture exercises
* Copyright (C) 2011 by Christian Terboven <terboven@rz.rwth-aachen.de>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include <stdio.h>
#include <omp.h>
int ser_fib(int n)
{
int x, y;
if (n < 2)
return n;
x = ser_fib(n - 1);
y = ser_fib(n - 2);
return x+y;
}
int fib(int n)
{
int x, y;
if (n < 2)
return n;
else if (n < 30)
return ser_fib(n);
#pragma omp task shared(x)
{
x = fib(n - 1);
}
#pragma omp task shared(y)
{
y = fib(n - 2);
}
#pragma omp taskwait
return x+y;
}
int main()
{
int n,fibonacci;
double starttime;
printf("Please insert n, to calculate fib(n): \n");
scanf("%d",&n);
starttime=omp_get_wtime();
#pragma omp parallel
#pragma omp single
{
fibonacci=fib(n);
}
printf("fib(%d)=%d \n",n,fibonacci);
printf("calculation took %lf sec\n",omp_get_wtime()-starttime);
return 0;
}

53
07-fibonacci/fibonacci.c Normal file
View file

@ -0,0 +1,53 @@
/*
* OpenMP lecture exercises
* Copyright (C) 2011 by Christian Terboven <terboven@rz.rwth-aachen.de>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include <stdio.h>
#include <omp.h>
int fib(int n)
{
int x, y;
if (n < 2)
return n;
x = fib(n - 1);
y = fib(n - 2);
return x+y;
}
int main()
{
int n,fibonacci;
double starttime;
printf("Please insert n, to calculate fib(n): \n");
scanf("%d",&n);
starttime=omp_get_wtime();
fibonacci=fib(n);
printf("fib(%d)=%d \n",n,fibonacci);
printf("calculation took %lf sec\n",omp_get_wtime()-starttime);
return 0;
}

1
07-fibonacci/input Normal file
View file

@ -0,0 +1 @@
35

20
08-quicksort/Makefile Normal file
View file

@ -0,0 +1,20 @@
PROG=quicksort
CXX = g++
CXXFLAGS=-O3 -fopenmp
LDFLAGS=-fopenmp
debug:
$(CXX) $(CXXFLAGS) -g -O0 ${PROG}.cpp -o ${PROG}.exe
release:
$(CXX) $(CXXFLAGS) -O3 ${PROG}.cpp -o ${PROG}.exe
solution:
$(CXX) $(CXXFLAGS) -O3 ${PROG}-solved.cpp -o ${PROG}-solved.exe
run:
./${PROG}.exe
clean:
rm -f ${PROG}.exe ${PROG}-solved.exe ${PROG}.o *~

View file

@ -0,0 +1,98 @@
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <omp.h>
#include <stdio.h>
using namespace std;
void init(int* array,int length){
srand((unsigned)time(0));
for(int i=0;i<length;i++)
array[i]=rand()%1234 + 1;
}
void swap(int &a, int &b){
int temp=a;
a=b;
b=temp;
}
int pivot(int* array, int first, int last){
int p = first;
int pivotElement = array[first];
for(int i=first+1;i<=last;i++){
if(array[i]<=pivotElement){
p++;
swap(array[i],array[p]);
}
}
swap(array[p],array[first]);
return p;
}
void serial_quicksort(int * array, int first, int last){
int pivotElement;
if(first<last){
pivotElement = pivot(array,first,last);
serial_quicksort(array,first,pivotElement-1);
serial_quicksort(array,pivotElement+1,last);
}
}
void quicksort(int * array, int first, int last){
int pivotElement;
if((last - first + 1) < 10000) {
serial_quicksort(array, first, last);
} else {
pivotElement = pivot(array,first,last);
#pragma omp task default(shared)
{
quicksort(array,first,pivotElement-1);
}
#pragma omp task default(shared)
{
quicksort(array,pivotElement+1,last);
}
#pragma omp taskwait
}
}
bool checkFn(int * array,int length){
for(int i=0;i<length-1;i++){
if(array[i]>array[i+1]){
cout << "array[" << i << "] > array[" << i+1 << "]" << endl;
return false;
}
}
return true;
}
int main(int argc, char *argv[]){
int length;
int *toBeSorted;
double t1,t2;
if(argc < 2){
length=6000000;
}else{
length=atoi(argv[1]);
}
toBeSorted = new int[length];
init(toBeSorted, length);
cout << "Sorting an array of " << length << " elements." << endl;
t1=omp_get_wtime();
#pragma omp parallel
#pragma omp single
{
quicksort(toBeSorted,0,length-1);
}
t2=omp_get_wtime()-t1;
cout << "quicksort took " << t2 << " sec. to complete" << endl;
if (!checkFn(toBeSorted, length)) {
cout << "validation failed!" << endl;
}
delete [] toBeSorted;
}

View file

@ -0,0 +1,76 @@
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <omp.h>
#include <stdio.h>
using namespace std;
void init(int* array,int length){
srand((unsigned)time(0));
for(int i=0;i<length;i++)
array[i]=rand()%1234 + 1;
}
void swap(int &a, int &b){
int temp=a;
a=b;
b=temp;
}
int pivot(int* array, int first, int last){
int p = first;
int pivotElement = array[first];
for(int i=first+1;i<=last;i++){
if(array[i]<=pivotElement){
p++;
swap(array[i],array[p]);
}
}
swap(array[p],array[first]);
return p;
}
void quicksort(int * array, int first, int last){
int pivotElement;
if(first<last){
pivotElement = pivot(array,first,last);
quicksort(array,first,pivotElement-1);
quicksort(array,pivotElement+1,last);
}
}
bool checkFn(int * array,int length){
for(int i=0;i<length-1;i++){
if(array[i]>array[i+1]){
cout << "array[" << i << "] > array[" << i+1 << "]" << endl;
return false;
}
}
return true;
}
int main(int argc, char *argv[]){
int length;
int *toBeSorted;
double t1,t2;
if(argc < 2){
length=6000000;
}else{
length=atoi(argv[1]);
}
toBeSorted = new int[length];
init(toBeSorted, length);
cout << "Sorting an array of " << length << " elements." << endl;
t1=omp_get_wtime();
quicksort(toBeSorted,0,length-1);
t2=omp_get_wtime()-t1;
cout << "quicksort took " << t2 << " sec. to complete" << endl;
if (!checkFn(toBeSorted, length)) {
cout << "validation failed!" << endl;
}
delete [] toBeSorted;
}

31
Makefile Normal file
View file

@ -0,0 +1,31 @@
ALL_RECURSIVE=all-recursive clean-recursive
TUT=SC19
DIRS=$(shell ls -d *-* | sed '/_/d' 2>/dev/null)
all: $(CONFIG) all-recursive
clean: $(CONFIG) clean-recursive
$(ALL_RECURSIVE):
@failcom='exit 1';\
target=`echo $@ | sed s/-recursive//`; \
for subdir in $(DIRS); do \
(cd $$subdir && $(MAKE) $$target) || eval $$failcom; \
done;
dist:
@failcom='exit 1';\
for subdir in $(DIRS); do \
(cd $$subdir && $(MAKE) clean) || eval $$failcom; \
done; \
tar --exclude="*-solved*" -cvzf ../mt-openmp-$(TUT).tar.gz --transform s/mt-openmp/mt-openmp-$(TUT)/ ../mt-openmp/*
dist-solved:
@failcom='exit 1';\
for subdir in $(DIRS); do \
(cd $$subdir && $(MAKE) clean) || eval $$failcom; \
done; \
tar -cvzf ../mt-openmp-$(TUT)-solved.tar.gz --transform s/mt-openmp/mt-openmp-$(TUT)-solved/ ../mt-openmp/*

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.