commit 8307cf55bff6979a92ec0870f85357f91da6ae68 Author: Jose Gracia Date: Fri Apr 26 15:47:20 2024 +0200 Exercises and slides diff --git a/00-worksharing/Makefile b/00-worksharing/Makefile new file mode 100644 index 0000000..9d1cee5 --- /dev/null +++ b/00-worksharing/Makefile @@ -0,0 +1,14 @@ +ALL_RECURSIVE=all-recursive clean-recursive + +DIRS=$(shell ls -d *-* | sed '/_/d' 2>/dev/null) + +all: $(CONFIG) all-recursive +clean: $(CONFIG) clean-recursive + +$(ALL_RECURSIVE): + @failcom='exit 1';\ + target=`echo $@ | sed s/-recursive//`; \ + for subdir in $(DIRS); do \ + (cd $$subdir && $(MAKE) $$target) || eval $$failcom; \ + done; + diff --git a/00-worksharing/bin/.gitdir b/00-worksharing/bin/.gitdir new file mode 100644 index 0000000..e69de29 diff --git a/00-worksharing/bin/run-cholesky.sh b/00-worksharing/bin/run-cholesky.sh new file mode 100755 index 0000000..ebcbc14 --- /dev/null +++ b/00-worksharing/bin/run-cholesky.sh @@ -0,0 +1,30 @@ +#!/bin/bash +#SBATCH --job-name=cholesky +#SBATCH --workdir=. +#SBATCH --output=cholesky_%j.out +#SBATCH --error=cholesky_%j.err +#SBATCH --cpus-per-task=48 +#SBATCH --ntasks=1 +#SBATCH --time=00:30:00 +#SBATCH --qos=debug + +export IFS=";" + +THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48" +#THREADS="01;02;04;06;08;10;12" + +#MSIZES="2048;4096;8192" +MSIZES="8192" +MSIZES="16384" +MSIZES="2048" + +BSIZES="256" + +for MS in $MSIZES; do + for BS in $BSIZES; do + for threads in $THREADS; do + OMP_NUM_THREADS=$threads ./cholesky-for $MS $BS 0 + OMP_NUM_THREADS=$threads ./cholesky-for-opt $MS $BS 0 + done + done +done diff --git a/00-worksharing/cholesky-for-opt/Makefile b/00-worksharing/cholesky-for-opt/Makefile new file mode 100644 index 0000000..2ec6310 --- /dev/null +++ b/00-worksharing/cholesky-for-opt/Makefile @@ -0,0 +1,23 @@ +BIN_DIR=../bin +PROGRAM=$(BIN_DIR)/cholesky-for-opt + +MKL_HOME=$(MKLROOT) + +CC=gcc +CFLAGS=-O3 -std=gnu99 -I$(MKL_HOME)/include -fopenmp +LDFLAGS=-L$(MKL_HOME)/lib/intel64 -lmkl_sequential -lmkl_core -lmkl_rt -lpthread -lm -fopenmp +#VERBOSE=-DVERBOSE + +all: $(PROGRAM) + +$(PROGRAM): cholesky.c + $(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS) + +$(BIN_DIR): + mkdir $@ + +clean: + rm -rf $(PROGRAM) *.o + +wipe: clean + rm -rf *.out *.err diff --git a/00-worksharing/cholesky-for-opt/cholesky.c b/00-worksharing/cholesky-for-opt/cholesky.c new file mode 100644 index 0000000..e3bd336 --- /dev/null +++ b/00-worksharing/cholesky-for-opt/cholesky.c @@ -0,0 +1,289 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "omp.h" + +#if !defined(_OPENMP) + int omp_get_max_threads() { return 1; } + int omp_get_num_threads() { return 1; } +#endif + +void cholesky(int ts, int nt, double* Ah[nt][nt]) +{ +#ifdef VERBOSE + printf("> Computing Cholesky Factorization: indirect blocked matrix...\n"); +#endif + + for (int k = 0; k < nt; k++) { + // Diagonal Block factorization: using LAPACK + LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts); + + // Triangular systems + for (int i = k + 1; i < nt; i++) { + cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts); + } + + // Update trailing matrix + for (int i = k + 1; i < nt; i++) { + for (int j = k + 1; j < i; j++) { + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts); + } + cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts); + } + } + +#ifdef VERBOSE + printf("> ...end of Cholesky Factorization.\n"); +#endif +} + +float get_time() +{ + static double gtod_ref_time_sec = 0.0; + + struct timeval tv; + gettimeofday(&tv, NULL); + + // If this is the first invocation of through dclock(), then initialize the + // "reference time" global variable to the seconds field of the tv struct. + if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec; + + // Normalize the seconds field of the tv struct so that it is relative to the + // "reference time" that was recorded during the first invocation of dclock(). + const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec; + + // Compute the number of seconds since the reference time. + const double t = norm_sec + tv.tv_usec * 1.0e-6; + + return (float) t; +} + +// Robust Check the factorization of the matrix A2 +// Using directly Fortran services: dlacpy_, dtrmm, dlange_ +static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps) +{ +#ifdef VERBOSE + printf("> Checking the Cholesky Factorization... \n"); +#endif + char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R'; + + double *Residual = (double *) malloc(N*N*sizeof(double)); + double *L1 = (double *) malloc(N*N*sizeof(double)); + double *L2 = (double *) malloc(N*N*sizeof(double)); + double *work = (double *) malloc(N*sizeof(double)); + + memset((void*)L1, 0, N*N*sizeof(double)); + memset((void*)L2, 0, N*N*sizeof(double)); + + double alpha= 1.0; + + dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N); + + /* Dealing with L'L or U'U */ + if (uplo == 'U'){ + dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N); + dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N); + dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N); + } else{ + dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N); + dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N); + dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N); + } + + /* Compute the Residual || A -L'L|| */ + for (int i = 0; i < N; i++) + for (int j = 0; j < N; j++) + Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i]; + + double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work); + double Anorm = dlange_(&NORM, &N, &N, A1, &N, work); + +#ifdef VERBOSE + printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps)); +#endif + + const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0); + +#ifdef VERBOSE + if ( info_factorization) printf("> - Factorization is suspicious!\n"); + else printf("> - Factorization is CORRECT!\n"); +#endif + + free(Residual); free(L1); free(L2); free(work); + + return info_factorization; +} + +void initialize_matrix(const int n, const int ts, double *matrix) +{ +#ifdef VERBOSE + printf("> Initializing matrix with random values...\n"); +#endif + + int ISEED[4] = {0,0,0,1}; + int intONE=1; + + for (int i = 0; i < n*n; i+=n) { + dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]); + } + + for (int i=0; i Converting linear matrix to blocks...\n"); +#endif + for (int i = 0; i < DIM; i++) + for (int j = 0; j < DIM; j++) { + gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]); + } +} + +void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N]) +{ +#ifdef VERBOSE + printf("> Converting blocked matrix to linear...\n"); +#endif + for (int i = 0; i < DIM; i++) + for (int j = 0; j < DIM; j++) { + scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]); + } +} + +int main(int argc, char* argv[]) +{ + char *result[3] = {"n/a","pass","FAIL"}; + const double eps = pow(2.0, -53); + + // If number of arguments is not 4, print help + if ( argc != 4) { + printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] ); + exit( -1 ); + } + + const int n = atoi(argv[1]); // matrix size + const int ts = atoi(argv[2]); // tile size + int check = atoi(argv[3]); // check result? + + // Compute number of tiles + const int nt = n / ts; + assert((nt*ts) == n); // tile size should divide size + + // Allocate matrix + double * const matrix = (double *) malloc(n * n * sizeof(double)); + assert(matrix != NULL); + + // Initialize matrix + initialize_matrix(n, ts, matrix); + + // Allocate original matrix, and duplicate it, for debugging purposes + double * const original_matrix = (double *) malloc(n * n * sizeof(double)); + assert(original_matrix != NULL); + + // Save a copy of the original matrix + for (int i = 0; i < n * n; i++ ) { + original_matrix[i] = matrix[i]; + } + + // Set version description: Indirect blocked matrix + const char *version = "I-Blocked matrix"; + + // Allocate blocked matrix + double *Ah[nt][nt]; + for (int i = 0; i < nt; i++) { + for (int j = 0; j < nt; j++) { + Ah[i][j] = malloc(ts * ts * sizeof(double)); + assert(Ah[i][j] != NULL); + } + } + + // --------------------------------------- + // Convert, compute (time), and re-convert + // --------------------------------------- + convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah); + const float tref = get_time(); + cholesky(ts, nt, (double* (*)[nt]) Ah); + const float time = get_time() - tref; + convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix); + + // Free blocked matrix + for (int i = 0; i < nt; i++) { + for (int j = 0; j < nt; j++) { + assert(Ah[i][j] != NULL); + free(Ah[i][j]); + } + } + + // Check result, if requested + if ( check ) { + const char uplo = 'L'; + if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++; + } + + // Free original matrix, not needed anymore + free(original_matrix); + + // Compute GFLOPs (Not verified) + float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9)); + + // Print results +#ifdef VERBOSE + printf( "\n" ); + printf( "============ CHOLESKY RESULTS ============\n" ); + printf( " test %s\n", argv[0]); + printf( " version %s\n", version); + printf( " matrix size: %dx%d\n", n, n); + printf( " tile size: %dx%d\n", ts, ts); + printf( " number of threads: %d\n", omp_get_max_threads()); + printf( " time (s): %f\n", time); + printf( " performance (gflops): %f\n", gflops); + printf( " check: %s\n", result[check]); + printf( "==========================================\n" ); +#else + printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]); +#endif + + // Free matrix + free(matrix); + + return 0; +} + diff --git a/00-worksharing/cholesky-for-opt/solution/cholesky.c b/00-worksharing/cholesky-for-opt/solution/cholesky.c new file mode 100644 index 0000000..8eb7004 --- /dev/null +++ b/00-worksharing/cholesky-for-opt/solution/cholesky.c @@ -0,0 +1,293 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "omp.h" + +#if !defined(_OPENMP) + int omp_get_max_threads() { return 1; } + int omp_get_num_threads() { return 1; } +#endif + +void cholesky(int ts, int nt, double* Ah[nt][nt]) +{ +#ifdef VERBOSE + printf("> Computing Cholesky Factorization: indirect blocked matrix...\n"); +#endif + + for (int k = 0; k < nt; k++) { + // Diagonal Block factorization: using LAPACK + LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts); + + // Triangular systems + #pragma omp parallel for + for (int i = k + 1; i < nt; i++) { + cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts); + } + + // Update trailing matrix + #pragma omp parallel + for (int i = k + 1; i < nt; i++) { + #pragma omp for schedule(dynamic, 1) nowait + for (int j = k + 1; j < i; j++) { + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts); + } + #pragma omp single nowait + cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts); + } + } + +#ifdef VERBOSE + printf("> ...end of Cholesky Factorization.\n"); +#endif +} + +float get_time() +{ + static double gtod_ref_time_sec = 0.0; + + struct timeval tv; + gettimeofday(&tv, NULL); + + // If this is the first invocation of through dclock(), then initialize the + // "reference time" global variable to the seconds field of the tv struct. + if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec; + + // Normalize the seconds field of the tv struct so that it is relative to the + // "reference time" that was recorded during the first invocation of dclock(). + const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec; + + // Compute the number of seconds since the reference time. + const double t = norm_sec + tv.tv_usec * 1.0e-6; + + return (float) t; +} + +// Robust Check the factorization of the matrix A2 +// Using directly Fortran services: dlacpy_, dtrmm, dlange_ +static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps) +{ +#ifdef VERBOSE + printf("> Checking the Cholesky Factorization... \n"); +#endif + char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R'; + + double *Residual = (double *) malloc(N*N*sizeof(double)); + double *L1 = (double *) malloc(N*N*sizeof(double)); + double *L2 = (double *) malloc(N*N*sizeof(double)); + double *work = (double *) malloc(N*sizeof(double)); + + memset((void*)L1, 0, N*N*sizeof(double)); + memset((void*)L2, 0, N*N*sizeof(double)); + + double alpha= 1.0; + + dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N); + + /* Dealing with L'L or U'U */ + if (uplo == 'U'){ + dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N); + dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N); + dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N); + } else{ + dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N); + dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N); + dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N); + } + + /* Compute the Residual || A -L'L|| */ + for (int i = 0; i < N; i++) + for (int j = 0; j < N; j++) + Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i]; + + double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work); + double Anorm = dlange_(&NORM, &N, &N, A1, &N, work); + +#ifdef VERBOSE + printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps)); +#endif + + const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0); + +#ifdef VERBOSE + if ( info_factorization) printf("> - Factorization is suspicious!\n"); + else printf("> - Factorization is CORRECT!\n"); +#endif + + free(Residual); free(L1); free(L2); free(work); + + return info_factorization; +} + +void initialize_matrix(const int n, const int ts, double *matrix) +{ +#ifdef VERBOSE + printf("> Initializing matrix with random values...\n"); +#endif + + int ISEED[4] = {0,0,0,1}; + int intONE=1; + + for (int i = 0; i < n*n; i+=n) { + dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]); + } + + for (int i=0; i Converting linear matrix to blocks...\n"); +#endif + for (int i = 0; i < DIM; i++) + for (int j = 0; j < DIM; j++) { + gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]); + } +} + +void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N]) +{ +#ifdef VERBOSE + printf("> Converting blocked matrix to linear...\n"); +#endif + for (int i = 0; i < DIM; i++) + for (int j = 0; j < DIM; j++) { + scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]); + } +} + +int main(int argc, char* argv[]) +{ + char *result[3] = {"n/a","pass","FAIL"}; + const double eps = pow(2.0, -53); + + // If number of arguments is not 4, print help + if ( argc != 4) { + printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] ); + exit( -1 ); + } + + const int n = atoi(argv[1]); // matrix size + const int ts = atoi(argv[2]); // tile size + int check = atoi(argv[3]); // check result? + + // Compute number of tiles + const int nt = n / ts; + assert((nt*ts) == n); // tile size should divide size + + // Allocate matrix + double * const matrix = (double *) malloc(n * n * sizeof(double)); + assert(matrix != NULL); + + // Initialize matrix + initialize_matrix(n, ts, matrix); + + // Allocate original matrix, and duplicate it, for debugging purposes + double * const original_matrix = (double *) malloc(n * n * sizeof(double)); + assert(original_matrix != NULL); + + // Save a copy of the original matrix + for (int i = 0; i < n * n; i++ ) { + original_matrix[i] = matrix[i]; + } + + // Set version description: Indirect blocked matrix + const char *version = "I-Blocked matrix"; + + // Allocate blocked matrix + double *Ah[nt][nt]; + for (int i = 0; i < nt; i++) { + for (int j = 0; j < nt; j++) { + Ah[i][j] = malloc(ts * ts * sizeof(double)); + assert(Ah[i][j] != NULL); + } + } + + // --------------------------------------- + // Convert, compute (time), and re-convert + // --------------------------------------- + convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah); + const float tref = get_time(); + cholesky(ts, nt, (double* (*)[nt]) Ah); + const float time = get_time() - tref; + convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix); + + // Free blocked matrix + for (int i = 0; i < nt; i++) { + for (int j = 0; j < nt; j++) { + assert(Ah[i][j] != NULL); + free(Ah[i][j]); + } + } + + // Check result, if requested + if ( check ) { + const char uplo = 'L'; + if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++; + } + + // Free original matrix, not needed anymore + free(original_matrix); + + // Compute GFLOPs (Not verified) + float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9)); + + // Print results +#ifdef VERBOSE + printf( "\n" ); + printf( "============ CHOLESKY RESULTS ============\n" ); + printf( " test %s\n", argv[0]); + printf( " version %s\n", version); + printf( " matrix size: %dx%d\n", n, n); + printf( " tile size: %dx%d\n", ts, ts); + printf( " number of threads: %d\n", omp_get_max_threads()); + printf( " time (s): %f\n", time); + printf( " performance (gflops): %f\n", gflops); + printf( " check: %s\n", result[check]); + printf( "==========================================\n" ); +#else + printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]); +#endif + + // Free matrix + free(matrix); + + return 0; +} + diff --git a/00-worksharing/cholesky-for/Makefile b/00-worksharing/cholesky-for/Makefile new file mode 100644 index 0000000..4b5dfe1 --- /dev/null +++ b/00-worksharing/cholesky-for/Makefile @@ -0,0 +1,23 @@ +BIN_DIR=../bin +PROGRAM=$(BIN_DIR)/cholesky-for + +MKL_HOME=$(MKLROOT) + +CC=gcc +CFLAGS=-O3 -std=gnu99 -I$(MKL_HOME)/include -fopenmp +LDFLAGS=-L$(MKL_HOME)/lib/intel64 -lmkl_sequential -lmkl_core -lmkl_rt -lpthread -lm -fopenmp +#VERBOSE=-DVERBOSE + +all: $(PROGRAM) + +$(PROGRAM): cholesky.c + $(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS) + +$(BIN_DIR): + mkdir $@ + +clean: + rm -rf $(PROGRAM) *.o + +wipe: clean + rm -rf *.out *.err diff --git a/00-worksharing/cholesky-for/cholesky.c b/00-worksharing/cholesky-for/cholesky.c new file mode 100644 index 0000000..e3bd336 --- /dev/null +++ b/00-worksharing/cholesky-for/cholesky.c @@ -0,0 +1,289 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "omp.h" + +#if !defined(_OPENMP) + int omp_get_max_threads() { return 1; } + int omp_get_num_threads() { return 1; } +#endif + +void cholesky(int ts, int nt, double* Ah[nt][nt]) +{ +#ifdef VERBOSE + printf("> Computing Cholesky Factorization: indirect blocked matrix...\n"); +#endif + + for (int k = 0; k < nt; k++) { + // Diagonal Block factorization: using LAPACK + LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts); + + // Triangular systems + for (int i = k + 1; i < nt; i++) { + cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts); + } + + // Update trailing matrix + for (int i = k + 1; i < nt; i++) { + for (int j = k + 1; j < i; j++) { + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts); + } + cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts); + } + } + +#ifdef VERBOSE + printf("> ...end of Cholesky Factorization.\n"); +#endif +} + +float get_time() +{ + static double gtod_ref_time_sec = 0.0; + + struct timeval tv; + gettimeofday(&tv, NULL); + + // If this is the first invocation of through dclock(), then initialize the + // "reference time" global variable to the seconds field of the tv struct. + if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec; + + // Normalize the seconds field of the tv struct so that it is relative to the + // "reference time" that was recorded during the first invocation of dclock(). + const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec; + + // Compute the number of seconds since the reference time. + const double t = norm_sec + tv.tv_usec * 1.0e-6; + + return (float) t; +} + +// Robust Check the factorization of the matrix A2 +// Using directly Fortran services: dlacpy_, dtrmm, dlange_ +static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps) +{ +#ifdef VERBOSE + printf("> Checking the Cholesky Factorization... \n"); +#endif + char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R'; + + double *Residual = (double *) malloc(N*N*sizeof(double)); + double *L1 = (double *) malloc(N*N*sizeof(double)); + double *L2 = (double *) malloc(N*N*sizeof(double)); + double *work = (double *) malloc(N*sizeof(double)); + + memset((void*)L1, 0, N*N*sizeof(double)); + memset((void*)L2, 0, N*N*sizeof(double)); + + double alpha= 1.0; + + dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N); + + /* Dealing with L'L or U'U */ + if (uplo == 'U'){ + dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N); + dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N); + dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N); + } else{ + dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N); + dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N); + dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N); + } + + /* Compute the Residual || A -L'L|| */ + for (int i = 0; i < N; i++) + for (int j = 0; j < N; j++) + Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i]; + + double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work); + double Anorm = dlange_(&NORM, &N, &N, A1, &N, work); + +#ifdef VERBOSE + printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps)); +#endif + + const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0); + +#ifdef VERBOSE + if ( info_factorization) printf("> - Factorization is suspicious!\n"); + else printf("> - Factorization is CORRECT!\n"); +#endif + + free(Residual); free(L1); free(L2); free(work); + + return info_factorization; +} + +void initialize_matrix(const int n, const int ts, double *matrix) +{ +#ifdef VERBOSE + printf("> Initializing matrix with random values...\n"); +#endif + + int ISEED[4] = {0,0,0,1}; + int intONE=1; + + for (int i = 0; i < n*n; i+=n) { + dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]); + } + + for (int i=0; i Converting linear matrix to blocks...\n"); +#endif + for (int i = 0; i < DIM; i++) + for (int j = 0; j < DIM; j++) { + gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]); + } +} + +void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N]) +{ +#ifdef VERBOSE + printf("> Converting blocked matrix to linear...\n"); +#endif + for (int i = 0; i < DIM; i++) + for (int j = 0; j < DIM; j++) { + scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]); + } +} + +int main(int argc, char* argv[]) +{ + char *result[3] = {"n/a","pass","FAIL"}; + const double eps = pow(2.0, -53); + + // If number of arguments is not 4, print help + if ( argc != 4) { + printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] ); + exit( -1 ); + } + + const int n = atoi(argv[1]); // matrix size + const int ts = atoi(argv[2]); // tile size + int check = atoi(argv[3]); // check result? + + // Compute number of tiles + const int nt = n / ts; + assert((nt*ts) == n); // tile size should divide size + + // Allocate matrix + double * const matrix = (double *) malloc(n * n * sizeof(double)); + assert(matrix != NULL); + + // Initialize matrix + initialize_matrix(n, ts, matrix); + + // Allocate original matrix, and duplicate it, for debugging purposes + double * const original_matrix = (double *) malloc(n * n * sizeof(double)); + assert(original_matrix != NULL); + + // Save a copy of the original matrix + for (int i = 0; i < n * n; i++ ) { + original_matrix[i] = matrix[i]; + } + + // Set version description: Indirect blocked matrix + const char *version = "I-Blocked matrix"; + + // Allocate blocked matrix + double *Ah[nt][nt]; + for (int i = 0; i < nt; i++) { + for (int j = 0; j < nt; j++) { + Ah[i][j] = malloc(ts * ts * sizeof(double)); + assert(Ah[i][j] != NULL); + } + } + + // --------------------------------------- + // Convert, compute (time), and re-convert + // --------------------------------------- + convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah); + const float tref = get_time(); + cholesky(ts, nt, (double* (*)[nt]) Ah); + const float time = get_time() - tref; + convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix); + + // Free blocked matrix + for (int i = 0; i < nt; i++) { + for (int j = 0; j < nt; j++) { + assert(Ah[i][j] != NULL); + free(Ah[i][j]); + } + } + + // Check result, if requested + if ( check ) { + const char uplo = 'L'; + if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++; + } + + // Free original matrix, not needed anymore + free(original_matrix); + + // Compute GFLOPs (Not verified) + float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9)); + + // Print results +#ifdef VERBOSE + printf( "\n" ); + printf( "============ CHOLESKY RESULTS ============\n" ); + printf( " test %s\n", argv[0]); + printf( " version %s\n", version); + printf( " matrix size: %dx%d\n", n, n); + printf( " tile size: %dx%d\n", ts, ts); + printf( " number of threads: %d\n", omp_get_max_threads()); + printf( " time (s): %f\n", time); + printf( " performance (gflops): %f\n", gflops); + printf( " check: %s\n", result[check]); + printf( "==========================================\n" ); +#else + printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]); +#endif + + // Free matrix + free(matrix); + + return 0; +} + diff --git a/00-worksharing/cholesky-for/solution/cholesky.c b/00-worksharing/cholesky-for/solution/cholesky.c new file mode 100644 index 0000000..270ca0e --- /dev/null +++ b/00-worksharing/cholesky-for/solution/cholesky.c @@ -0,0 +1,291 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "omp.h" + +#if !defined(_OPENMP) + int omp_get_max_threads() { return 1; } + int omp_get_num_threads() { return 1; } +#endif + +void cholesky(int ts, int nt, double* Ah[nt][nt]) +{ +#ifdef VERBOSE + printf("> Computing Cholesky Factorization: indirect blocked matrix...\n"); +#endif + + for (int k = 0; k < nt; k++) { + // Diagonal Block factorization: using LAPACK + LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts); + + // Triangular systems + #pragma omp parallel for + for (int i = k + 1; i < nt; i++) { + cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts); + } + + // Update trailing matrix + #pragma omp parallel for + for (int i = k + 1; i < nt; i++) { + for (int j = k + 1; j < i; j++) { + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts); + } + cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts); + } + } + +#ifdef VERBOSE + printf("> ...end of Cholesky Factorization.\n"); +#endif +} + +float get_time() +{ + static double gtod_ref_time_sec = 0.0; + + struct timeval tv; + gettimeofday(&tv, NULL); + + // If this is the first invocation of through dclock(), then initialize the + // "reference time" global variable to the seconds field of the tv struct. + if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec; + + // Normalize the seconds field of the tv struct so that it is relative to the + // "reference time" that was recorded during the first invocation of dclock(). + const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec; + + // Compute the number of seconds since the reference time. + const double t = norm_sec + tv.tv_usec * 1.0e-6; + + return (float) t; +} + +// Robust Check the factorization of the matrix A2 +// Using directly Fortran services: dlacpy_, dtrmm, dlange_ +static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps) +{ +#ifdef VERBOSE + printf("> Checking the Cholesky Factorization... \n"); +#endif + char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R'; + + double *Residual = (double *) malloc(N*N*sizeof(double)); + double *L1 = (double *) malloc(N*N*sizeof(double)); + double *L2 = (double *) malloc(N*N*sizeof(double)); + double *work = (double *) malloc(N*sizeof(double)); + + memset((void*)L1, 0, N*N*sizeof(double)); + memset((void*)L2, 0, N*N*sizeof(double)); + + double alpha= 1.0; + + dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N); + + /* Dealing with L'L or U'U */ + if (uplo == 'U'){ + dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N); + dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N); + dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N); + } else{ + dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N); + dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N); + dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N); + } + + /* Compute the Residual || A -L'L|| */ + for (int i = 0; i < N; i++) + for (int j = 0; j < N; j++) + Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i]; + + double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work); + double Anorm = dlange_(&NORM, &N, &N, A1, &N, work); + +#ifdef VERBOSE + printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps)); +#endif + + const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0); + +#ifdef VERBOSE + if ( info_factorization) printf("> - Factorization is suspicious!\n"); + else printf("> - Factorization is CORRECT!\n"); +#endif + + free(Residual); free(L1); free(L2); free(work); + + return info_factorization; +} + +void initialize_matrix(const int n, const int ts, double *matrix) +{ +#ifdef VERBOSE + printf("> Initializing matrix with random values...\n"); +#endif + + int ISEED[4] = {0,0,0,1}; + int intONE=1; + + for (int i = 0; i < n*n; i+=n) { + dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]); + } + + for (int i=0; i Converting linear matrix to blocks...\n"); +#endif + for (int i = 0; i < DIM; i++) + for (int j = 0; j < DIM; j++) { + gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]); + } +} + +void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N]) +{ +#ifdef VERBOSE + printf("> Converting blocked matrix to linear...\n"); +#endif + for (int i = 0; i < DIM; i++) + for (int j = 0; j < DIM; j++) { + scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]); + } +} + +int main(int argc, char* argv[]) +{ + char *result[3] = {"n/a","pass","FAIL"}; + const double eps = pow(2.0, -53); + + // If number of arguments is not 4, print help + if ( argc != 4) { + printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] ); + exit( -1 ); + } + + const int n = atoi(argv[1]); // matrix size + const int ts = atoi(argv[2]); // tile size + int check = atoi(argv[3]); // check result? + + // Compute number of tiles + const int nt = n / ts; + assert((nt*ts) == n); // tile size should divide size + + // Allocate matrix + double * const matrix = (double *) malloc(n * n * sizeof(double)); + assert(matrix != NULL); + + // Initialize matrix + initialize_matrix(n, ts, matrix); + + // Allocate original matrix, and duplicate it, for debugging purposes + double * const original_matrix = (double *) malloc(n * n * sizeof(double)); + assert(original_matrix != NULL); + + // Save a copy of the original matrix + for (int i = 0; i < n * n; i++ ) { + original_matrix[i] = matrix[i]; + } + + // Set version description: Indirect blocked matrix + const char *version = "I-Blocked matrix"; + + // Allocate blocked matrix + double *Ah[nt][nt]; + for (int i = 0; i < nt; i++) { + for (int j = 0; j < nt; j++) { + Ah[i][j] = malloc(ts * ts * sizeof(double)); + assert(Ah[i][j] != NULL); + } + } + + // --------------------------------------- + // Convert, compute (time), and re-convert + // --------------------------------------- + convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah); + const float tref = get_time(); + cholesky(ts, nt, (double* (*)[nt]) Ah); + const float time = get_time() - tref; + convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix); + + // Free blocked matrix + for (int i = 0; i < nt; i++) { + for (int j = 0; j < nt; j++) { + assert(Ah[i][j] != NULL); + free(Ah[i][j]); + } + } + + // Check result, if requested + if ( check ) { + const char uplo = 'L'; + if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++; + } + + // Free original matrix, not needed anymore + free(original_matrix); + + // Compute GFLOPs (Not verified) + float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9)); + + // Print results +#ifdef VERBOSE + printf( "\n" ); + printf( "============ CHOLESKY RESULTS ============\n" ); + printf( " test %s\n", argv[0]); + printf( " version %s\n", version); + printf( " matrix size: %dx%d\n", n, n); + printf( " tile size: %dx%d\n", ts, ts); + printf( " number of threads: %d\n", omp_get_max_threads()); + printf( " time (s): %f\n", time); + printf( " performance (gflops): %f\n", gflops); + printf( " check: %s\n", result[check]); + printf( "==========================================\n" ); +#else + printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]); +#endif + + // Free matrix + free(matrix); + + return 0; +} + diff --git a/00-worksharing/matmul/Makefile b/00-worksharing/matmul/Makefile new file mode 100644 index 0000000..3bcffd8 --- /dev/null +++ b/00-worksharing/matmul/Makefile @@ -0,0 +1,23 @@ +BIN_DIR=../bin +PROGRAM=$(BIN_DIR)/matmul + +MKL_HOME=$(MKLROOT) + +CC=gcc +CFLAGS=-O3 -std=gnu99 -fopenmp +LDFLAGS=-fopenmp +#VERBOSE=-DVERBOSE + +all: $(PROGRAM) + +$(PROGRAM): matmul.c + $(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS) + +$(BIN_DIR): + mkdir $@ + +clean: + rm -rf $(PROGRAM) *.o + +wipe: clean + rm -rf *.out *.err diff --git a/00-worksharing/matmul/matmul.c b/00-worksharing/matmul/matmul.c new file mode 100644 index 0000000..5c619ab --- /dev/null +++ b/00-worksharing/matmul/matmul.c @@ -0,0 +1,106 @@ +//===-- matmul.c - Different implementations of matrix multiplies -*- C -*-===// +// +// Part of the LOMP Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include + +#define DUMP_MATRIX 0 + +void matmul_seq(double * C, double * A, double * B, size_t n) { + for (size_t i = 0; i < n; ++i) { + for (size_t k = 0; k < n; ++k) { + for (size_t j = 0; j < n; ++j) { + C[i * n + j] += A[i * n + k] * B[k * n + j]; + } + } + } +} + +void matmul_par(double * C, double * A, double * B, size_t n) { + for (size_t i = 0; i < n; ++i) { + for (size_t k = 0; k < n; ++k) { + for (size_t j = 0; j < n; ++j) { + C[i * n + j] += A[i * n + k] * B[k * n + j]; + } + } + } +} + +void init_mat(double * C, double * A, double * B, size_t n) { + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < n; ++j) { + C[i * n + j] = 0.0; + A[i * n + j] = 0.5; + B[i * n + j] = 0.25; + } + } +} + +void dump_mat(double * mtx, size_t n) { + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < n; ++j) { + printf("%f ", mtx[i * n + j]); + } + printf("\n"); + } +} + +double sum_mat(double * mtx, size_t n) { + double sum = 0.0; + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < n; ++j) { + sum += mtx[i * n + j]; + } + } + return sum; +} + +int main(int argc, char * argv[]) { + double ts, te; + double t_seq; + + double * C; + double * A; + double * B; + + // If number of arguments is not 1, print help + if (argc != 2) { + printf("%s: matrix_size\n", argv[0]); + return EXIT_FAILURE; + } + const int n = atoi(argv[1]); // matrix size + + C = (double *)malloc(sizeof(*C) * n * n); + A = (double *)malloc(sizeof(*A) * n * n); + B = (double *)malloc(sizeof(*B) * n * n); + + init_mat(C, A, B, n); + ts = omp_get_wtime(); + matmul_seq(C, A, B, n); + te = omp_get_wtime(); +#if DUMP_MATRIX + dump_mat(C, n); +#endif + t_seq = te - ts; + printf("Sum of matrix (serial): %f, wall time %lf, speed-up %.2lf\n", + sum_mat(C, n), (te - ts), t_seq / (te - ts)); + + init_mat(C, A, B, n); + ts = omp_get_wtime(); + matmul_par(C, A, B, n); + te = omp_get_wtime(); +#if DUMP_MATRIX + dump_mat(C, n); +#endif + printf("Sum of matrix (parallel): %f, wall time %lf, speed-up %.2lf\n", + sum_mat(C, n), (te - ts), t_seq / (te - ts)); + + return EXIT_SUCCESS; +} diff --git a/00-worksharing/matmul/solution/matmul.c b/00-worksharing/matmul/solution/matmul.c new file mode 100644 index 0000000..332c6b1 --- /dev/null +++ b/00-worksharing/matmul/solution/matmul.c @@ -0,0 +1,108 @@ +//===-- matmul.c - Different implementations of matrix multiplies -*- C -*-===// +// +// Part of the LOMP Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include + +#define DUMP_MATRIX 0 + +void matmul_seq(double * C, double * A, double * B, size_t n) { + for (size_t i = 0; i < n; ++i) { + for (size_t k = 0; k < n; ++k) { + for (size_t j = 0; j < n; ++j) { + C[i * n + j] += A[i * n + k] * B[k * n + j]; + } + } + } +} + +void matmul_par(double * C, double * A, double * B, size_t n) { +#pragma omp parallel for shared(A,B,C) firstprivate(n) \ + schedule(static) // collapse(2) + for (size_t i = 0; i < n; ++i) { + for (size_t k = 0; k < n; ++k) { + for (size_t j = 0; j < n; ++j) { + C[i * n + j] += A[i * n + k] * B[k * n + j]; + } + } + } +} + +void init_mat(double * C, double * A, double * B, size_t n) { + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < n; ++j) { + C[i * n + j] = 0.0; + A[i * n + j] = 0.5; + B[i * n + j] = 0.25; + } + } +} + +void dump_mat(double * mtx, size_t n) { + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < n; ++j) { + printf("%f ", mtx[i * n + j]); + } + printf("\n"); + } +} + +double sum_mat(double * mtx, size_t n) { + double sum = 0.0; + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < n; ++j) { + sum += mtx[i * n + j]; + } + } + return sum; +} + +int main(int argc, char * argv[]) { + double ts, te; + double t_seq; + + double * C; + double * A; + double * B; + + // If number of arguments is not 1, print help + if (argc != 2) { + printf("%s: matrix_size\n", argv[0]); + return EXIT_FAILURE; + } + const int n = atoi(argv[1]); // matrix size + + C = (double *)malloc(sizeof(*C) * n * n); + A = (double *)malloc(sizeof(*A) * n * n); + B = (double *)malloc(sizeof(*B) * n * n); + + init_mat(C, A, B, n); + ts = omp_get_wtime(); + matmul_seq(C, A, B, n); + te = omp_get_wtime(); +#if DUMP_MATRIX + dump_mat(C, n); +#endif + t_seq = te - ts; + printf("Sum of matrix (serial): %f, wall time %lf, speed-up %.2lf\n", + sum_mat(C, n), (te - ts), t_seq / (te - ts)); + + init_mat(C, A, B, n); + ts = omp_get_wtime(); + matmul_par(C, A, B, n); + te = omp_get_wtime(); +#if DUMP_MATRIX + dump_mat(C, n); +#endif + printf("Sum of matrix (parallel): %f, wall time %lf, speed-up %.2lf\n", + sum_mat(C, n), (te - ts), t_seq / (te - ts)); + + return EXIT_SUCCESS; +} diff --git a/00-worksharing/sin-cos/Makefile b/00-worksharing/sin-cos/Makefile new file mode 100644 index 0000000..e8a300c --- /dev/null +++ b/00-worksharing/sin-cos/Makefile @@ -0,0 +1,21 @@ +BIN_DIR=../bin +PROGRAM=$(BIN_DIR)/sin-cos + +CC=gcc +CFLAGS=-O3 -std=gnu99 -fopenmp +LDFLAGS=-lm -fopenmp +#VERBOSE=-DVERBOSE + +all: $(PROGRAM) + +$(PROGRAM): sin-cos.c + $(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS) + +$(BIN_DIR): + mkdir $@ + +clean: + rm -rf $(PROGRAM) *.o + +wipe: clean + rm -rf *.out *.err diff --git a/00-worksharing/sin-cos/sin-cos.c b/00-worksharing/sin-cos/sin-cos.c new file mode 100644 index 0000000..412b15f --- /dev/null +++ b/00-worksharing/sin-cos/sin-cos.c @@ -0,0 +1,58 @@ +/* + * OpenMP lecture exercises + * Copyright (C) 2011 by Christian Terboven + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + + +#include +#include +#include +#include + + +double do_some_computation(int i) +{ + double t = 0.0; + int j; + for (j = 0; j < i*i; j++) + { + t += sin((double)j) * cos((double)j); + } + return t; +} + + +int main(int argc, char* argv[]) +{ + const int dimension = 500; + int i; + double result = 0.0; + + double t1 = omp_get_wtime(); + + for (i = 0; i < dimension; i++) + { + result += do_some_computation(i); + } + + double t2 = omp_get_wtime(); + printf("Computation took %.3lf seconds.\n", t2 - t1); + printf("Result is %.3lf.\n", result); + + return 0; +} diff --git a/00-worksharing/sin-cos/solution/sin-cos.c b/00-worksharing/sin-cos/solution/sin-cos.c new file mode 100644 index 0000000..f493b79 --- /dev/null +++ b/00-worksharing/sin-cos/solution/sin-cos.c @@ -0,0 +1,59 @@ +/* + * OpenMP lecture exercises + * Copyright (C) 2011 by Christian Terboven + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + + +#include +#include +#include +#include + + +double do_some_computation(int i) +{ + double t = 0.0; + int j; + for (j = 0; j < i*i; j++) + { + t += sin((double)j) * cos((double)j); + } + return t; +} + + +int main(int argc, char* argv[]) +{ + const int dimension = 500; + int i; + double result = 0.0; + + double t1 = omp_get_wtime(); + + #pragma omp parallel for schedule(dynamic) reduction(+:result) + for (i = 0; i < dimension; i++) + { + result += do_some_computation(i); + } + + double t2 = omp_get_wtime(); + printf("Computation took %.3lf seconds.\n", t2 - t1); + printf("Result is %.3lf.\n", result); + + return 0; +} diff --git a/01-tasks/Makefile b/01-tasks/Makefile new file mode 100644 index 0000000..9d1cee5 --- /dev/null +++ b/01-tasks/Makefile @@ -0,0 +1,14 @@ +ALL_RECURSIVE=all-recursive clean-recursive + +DIRS=$(shell ls -d *-* | sed '/_/d' 2>/dev/null) + +all: $(CONFIG) all-recursive +clean: $(CONFIG) clean-recursive + +$(ALL_RECURSIVE): + @failcom='exit 1';\ + target=`echo $@ | sed s/-recursive//`; \ + for subdir in $(DIRS); do \ + (cd $$subdir && $(MAKE) $$target) || eval $$failcom; \ + done; + diff --git a/01-tasks/bin/.gitdir b/01-tasks/bin/.gitdir new file mode 100644 index 0000000..e69de29 diff --git a/01-tasks/bin/run-cholesky.sh b/01-tasks/bin/run-cholesky.sh new file mode 100755 index 0000000..e411b7c --- /dev/null +++ b/01-tasks/bin/run-cholesky.sh @@ -0,0 +1,28 @@ +#!/bin/bash +#SBATCH --job-name=cholesky +#SBATCH --workdir=. +#SBATCH --output=cholesky_%j.out +#SBATCH --error=cholesky_%j.err +#SBATCH --cpus-per-task=48 +#SBATCH --ntasks=1 +#SBATCH --time=00:30:00 +#SBATCH --qos=debug + +export IFS=";" + +THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48" +#THREADS="01;02;04;06;08;10;12" + +#MSIZES="2048;4096;8192;16384" +MSIZES="4096" + +BSIZES="512" + +for MS in $MSIZES; do + for BS in $BSIZES; do + for threads in $THREADS; do + OMP_NUM_THREADS=$threads ./cholesky-tsk $MS $BS 0 + done + done +done + diff --git a/01-tasks/bin/run-sudoku.sh b/01-tasks/bin/run-sudoku.sh new file mode 100755 index 0000000..e0a3e0b --- /dev/null +++ b/01-tasks/bin/run-sudoku.sh @@ -0,0 +1,20 @@ +#!/bin/bash +#SBATCH --job-name=sudoku +#SBATCH --workdir=. +#SBATCH --output=sudoku_%j.out +#SBATCH --error=sudoku_%j.err +#SBATCH --cpus-per-task=48 +#SBATCH --ntasks=1 +#SBATCH --time=00:30:00 +#SBATCH --qos=debug + +export IFS=";" + +THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48" +#THREADS="01;02;04;06;08;10;12" + +for threads in $THREADS; do + OMP_NUM_THREADS=$threads ./sudoku-tsk 9 3 sudoku-9x9-1.txt + #OMP_NUM_THREADS=$threads ./sudoku-tsk 16 4 sudoku-16x16-1.txt + #OMP_NUM_THREADS=$threads ./sudoku-tsk 25 5 sudoku-25x25-1.txt +done diff --git a/01-tasks/bin/sudoku-16x16-1.txt b/01-tasks/bin/sudoku-16x16-1.txt new file mode 100644 index 0000000..b3eb632 --- /dev/null +++ b/01-tasks/bin/sudoku-16x16-1.txt @@ -0,0 +1,16 @@ +0 6 0 0 0 0 0 8 11 0 0 15 14 0 0 16 +15 11 0 0 0 16 14 0 0 0 12 0 0 6 0 0 +13 0 9 12 0 0 0 0 3 16 14 0 15 11 10 0 +2 0 16 0 11 0 15 10 1 0 0 0 0 0 0 0 +0 15 11 10 0 0 16 2 13 8 9 12 0 0 0 0 +12 13 0 0 4 1 5 6 2 3 0 0 0 0 11 10 +5 0 6 1 12 0 9 0 15 11 10 7 16 0 0 3 +0 2 0 0 0 10 0 11 6 0 5 0 0 13 0 9 +10 7 15 11 16 0 0 0 12 13 0 0 0 0 0 6 +9 0 0 0 0 0 1 0 0 2 0 16 10 0 0 11 +1 0 4 6 9 13 0 0 7 0 11 0 3 16 0 0 +16 14 0 0 7 0 10 15 4 6 1 0 0 0 13 8 +11 10 0 15 0 0 0 16 9 12 13 0 0 1 5 4 +0 0 12 0 1 4 6 0 16 0 0 0 11 10 0 0 +0 0 5 0 8 12 13 0 10 0 0 11 2 0 0 14 +3 16 0 0 10 0 0 7 0 0 6 0 0 0 12 0 diff --git a/01-tasks/bin/sudoku-25x25-1.txt b/01-tasks/bin/sudoku-25x25-1.txt new file mode 100644 index 0000000..dc1d28e --- /dev/null +++ b/01-tasks/bin/sudoku-25x25-1.txt @@ -0,0 +1,25 @@ +0 8 0 18 0 12 20 21 16 0 0 0 0 0 0 15 0 0 24 5 7 0 19 0 0 +0 0 15 9 5 4 0 23 0 18 12 20 0 21 16 7 0 0 0 2 1 0 0 0 0 +0 0 0 0 14 7 0 0 0 0 0 24 0 0 0 0 10 0 0 16 0 0 23 18 6 +21 20 12 0 16 0 0 25 14 22 7 0 0 19 2 4 0 23 8 6 0 24 0 0 0 +19 0 7 0 2 15 0 0 0 9 0 0 18 0 0 0 0 0 0 0 12 0 21 0 16 +0 23 6 0 0 16 0 0 0 20 14 25 11 0 0 5 24 7 3 9 2 19 1 0 17 +4 21 0 20 0 0 0 12 22 11 2 19 0 0 0 0 0 15 23 0 0 3 7 0 9 +12 25 14 0 22 0 19 0 0 13 0 0 24 7 0 0 0 0 21 10 6 0 15 8 18 +7 0 0 24 9 6 0 15 18 0 0 0 0 0 10 0 13 1 19 0 0 25 0 11 22 +0 19 2 13 17 5 0 0 9 0 6 0 8 15 0 14 11 12 25 22 16 0 0 0 10 +0 0 0 1 0 0 0 0 3 7 8 0 15 0 0 11 12 10 0 0 0 0 18 4 0 +10 16 11 12 25 0 14 22 19 1 0 0 0 17 3 20 0 0 6 21 8 0 0 0 0 +0 0 8 0 23 20 0 18 21 4 11 16 0 0 25 0 7 0 2 0 0 14 22 0 0 +0 0 0 0 0 0 0 0 0 0 0 0 0 22 19 8 15 0 5 23 24 0 17 0 3 +0 2 0 7 3 8 0 9 23 15 20 0 4 0 21 13 1 0 14 19 0 0 10 12 25 +2 0 0 0 0 18 15 5 0 23 0 0 21 6 20 17 0 14 1 13 0 12 0 25 11 +0 1 0 19 13 9 7 0 24 0 0 0 23 0 8 0 25 0 0 11 0 0 0 21 0 +0 0 10 0 0 0 0 0 0 0 17 1 0 14 0 18 0 5 15 8 9 0 0 0 0 +16 12 0 0 11 0 0 14 0 0 0 0 3 2 24 0 21 0 4 20 0 15 0 0 8 +0 0 0 0 0 10 0 0 0 0 0 12 0 0 11 9 3 0 0 0 17 0 14 19 13 +24 0 0 0 15 21 18 0 4 0 0 0 16 20 0 3 2 13 0 7 19 22 0 0 1 +0 18 0 0 0 0 10 0 0 16 0 22 0 0 0 23 0 24 9 0 3 17 0 0 0 +0 0 0 0 0 0 0 0 15 5 0 0 6 8 4 0 14 11 22 1 25 0 20 16 12 +0 22 19 14 1 3 0 0 0 0 23 9 5 0 15 25 0 20 10 12 21 0 8 0 4 +20 0 25 16 0 0 22 11 1 0 3 0 2 13 7 21 0 0 0 0 0 9 0 0 0 diff --git a/01-tasks/bin/sudoku-9x9-1.txt b/01-tasks/bin/sudoku-9x9-1.txt new file mode 100644 index 0000000..3101a00 --- /dev/null +++ b/01-tasks/bin/sudoku-9x9-1.txt @@ -0,0 +1,10 @@ +5 3 0 0 7 0 0 0 0 +6 0 0 1 9 5 3 0 0 +0 9 8 0 0 0 0 6 0 +8 0 0 0 6 0 0 0 3 +4 0 0 8 0 3 0 0 1 +7 0 0 0 2 0 0 0 6 +0 6 0 0 0 0 2 8 0 +0 0 0 4 1 9 0 0 5 +0 0 0 0 8 0 0 7 9 + diff --git a/01-tasks/cholesky-tsk/Makefile b/01-tasks/cholesky-tsk/Makefile new file mode 100644 index 0000000..fcbad34 --- /dev/null +++ b/01-tasks/cholesky-tsk/Makefile @@ -0,0 +1,23 @@ +BIN_DIR=../bin +PROGRAM=$(BIN_DIR)/cholesky-tsk + +MKL_HOME=$(MKLROOT) + +CC=gcc +CFLAGS=-O3 -std=gnu99 -I$(MKL_HOME)/include -fopenmp +LDFLAGS=-L$(MKL_HOME)/lib/intel64 -lmkl_sequential -lmkl_core -lmkl_rt -lpthread -lm -fopenmp +VERBOSE=-DVERBOSE + +all: $(PROGRAM) + +$(PROGRAM): cholesky.c + $(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS) + +$(BIN_DIR): + mkdir $@ + +clean: + rm -rf $(PROGRAM) *.o + +wipe: clean + rm -rf *.out *.err diff --git a/01-tasks/cholesky-tsk/cholesky-solved.c b/01-tasks/cholesky-tsk/cholesky-solved.c new file mode 100644 index 0000000..3014283 --- /dev/null +++ b/01-tasks/cholesky-tsk/cholesky-solved.c @@ -0,0 +1,296 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "omp.h" + +#if !defined(_OPENMP) + int omp_get_max_threads() { return 1; } + int omp_get_num_threads() { return 1; } +#endif + +void cholesky(int ts, int nt, double* Ah[nt][nt]) +{ +#ifdef VERBOSE + printf("> Computing Cholesky Factorization: indirect blocked matrix...\n"); +#endif + + #pragma omp parallel + #pragma omp single + for (int k = 0; k < nt; k++) { + // Diagonal Block factorization: using LAPACK + LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts); + + // Triangular systems + for (int i = k + 1; i < nt; i++) { + #pragma omp task + cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts); + } + #pragma omp taskwait + + // Update trailing matrix + for (int i = k + 1; i < nt; i++) { + for (int j = k + 1; j < i; j++) { + #pragma omp task + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts); + } + #pragma omp task + cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts); + } + #pragma omp taskwait + } + +#ifdef VERBOSE + printf("> ...end of Cholesky Factorization.\n"); +#endif +} + +float get_time() +{ + static double gtod_ref_time_sec = 0.0; + + struct timeval tv; + gettimeofday(&tv, NULL); + + // If this is the first invocation of through dclock(), then initialize the + // "reference time" global variable to the seconds field of the tv struct. + if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec; + + // Normalize the seconds field of the tv struct so that it is relative to the + // "reference time" that was recorded during the first invocation of dclock(). + const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec; + + // Compute the number of seconds since the reference time. + const double t = norm_sec + tv.tv_usec * 1.0e-6; + + return (float) t; +} + +// Robust Check the factorization of the matrix A2 +// Using directly Fortran services: dlacpy_, dtrmm, dlange_ +static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps) +{ +#ifdef VERBOSE + printf("> Checking the Cholesky Factorization... \n"); +#endif + char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R'; + + double *Residual = (double *) malloc(N*N*sizeof(double)); + double *L1 = (double *) malloc(N*N*sizeof(double)); + double *L2 = (double *) malloc(N*N*sizeof(double)); + double *work = (double *) malloc(N*sizeof(double)); + + memset((void*)L1, 0, N*N*sizeof(double)); + memset((void*)L2, 0, N*N*sizeof(double)); + + double alpha= 1.0; + + dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N); + + /* Dealing with L'L or U'U */ + if (uplo == 'U'){ + dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N); + dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N); + dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N); + } else{ + dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N); + dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N); + dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N); + } + + /* Compute the Residual || A -L'L|| */ + for (int i = 0; i < N; i++) + for (int j = 0; j < N; j++) + Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i]; + + double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work); + double Anorm = dlange_(&NORM, &N, &N, A1, &N, work); + +#ifdef VERBOSE + printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps)); +#endif + + const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0); + +#ifdef VERBOSE + if ( info_factorization) printf("> - Factorization is suspicious!\n"); + else printf("> - Factorization is CORRECT!\n"); +#endif + + free(Residual); free(L1); free(L2); free(work); + + return info_factorization; +} + +void initialize_matrix(const int n, const int ts, double *matrix) +{ +#ifdef VERBOSE + printf("> Initializing matrix with random values...\n"); +#endif + + int ISEED[4] = {0,0,0,1}; + int intONE=1; + + for (int i = 0; i < n*n; i+=n) { + dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]); + } + + for (int i=0; i Converting linear matrix to blocks...\n"); +#endif + for (int i = 0; i < DIM; i++) + for (int j = 0; j < DIM; j++) { + gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]); + } +} + +void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N]) +{ +#ifdef VERBOSE + printf("> Converting blocked matrix to linear...\n"); +#endif + for (int i = 0; i < DIM; i++) + for (int j = 0; j < DIM; j++) { + scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]); + } +} + +int main(int argc, char* argv[]) +{ + char *result[3] = {"n/a","pass","FAIL"}; + const double eps = pow(2.0, -53); + + // If number of arguments is not 4, print help + if ( argc != 4) { + printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] ); + exit( -1 ); + } + + const int n = atoi(argv[1]); // matrix size + const int ts = atoi(argv[2]); // tile size + int check = atoi(argv[3]); // check result? + + // Compute number of tiles + const int nt = n / ts; + assert((nt*ts) == n); // tile size should divide size + + // Allocate matrix + double * const matrix = (double *) malloc(n * n * sizeof(double)); + assert(matrix != NULL); + + // Initialize matrix + initialize_matrix(n, ts, matrix); + + // Allocate original matrix, and duplicate it, for debugging purposes + double * const original_matrix = (double *) malloc(n * n * sizeof(double)); + assert(original_matrix != NULL); + + // Save a copy of the original matrix + for (int i = 0; i < n * n; i++ ) { + original_matrix[i] = matrix[i]; + } + + // Set version description: Indirect blocked matrix + const char *version = "I-Blocked matrix"; + + // Allocate blocked matrix + double *Ah[nt][nt]; + for (int i = 0; i < nt; i++) { + for (int j = 0; j < nt; j++) { + Ah[i][j] = malloc(ts * ts * sizeof(double)); + assert(Ah[i][j] != NULL); + } + } + + // --------------------------------------- + // Convert, compute (time), and re-convert + // --------------------------------------- + convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah); + const float tref = get_time(); + cholesky(ts, nt, (double* (*)[nt]) Ah); + const float time = get_time() - tref; + convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix); + + // Free blocked matrix + for (int i = 0; i < nt; i++) { + for (int j = 0; j < nt; j++) { + assert(Ah[i][j] != NULL); + free(Ah[i][j]); + } + } + + // Check result, if requested + if ( check ) { + const char uplo = 'L'; + if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++; + } + + // Free original matrix, not needed anymore + free(original_matrix); + + // Compute GFLOPs (Not verified) + float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9)); + + // Print results +#ifdef VERBOSE + printf( "\n" ); + printf( "============ CHOLESKY RESULTS ============\n" ); + printf( " test %s\n", argv[0]); + printf( " version %s\n", version); + printf( " matrix size: %dx%d\n", n, n); + printf( " tile size: %dx%d\n", ts, ts); + printf( " number of threads: %d\n", omp_get_max_threads()); + printf( " time (s): %f\n", time); + printf( " performance (gflops): %f\n", gflops); + printf( " check: %s\n", result[check]); + printf( "==========================================\n" ); +#else + printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]); +#endif + + // Free matrix + free(matrix); + + return 0; +} + diff --git a/01-tasks/cholesky-tsk/cholesky.c b/01-tasks/cholesky-tsk/cholesky.c new file mode 100644 index 0000000..b86ad10 --- /dev/null +++ b/01-tasks/cholesky-tsk/cholesky.c @@ -0,0 +1,292 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "omp.h" + +#if !defined(_OPENMP) + int omp_get_max_threads() { return 1; } + int omp_get_num_threads() { return 1; } +#endif + +void cholesky(int ts, int nt, double* Ah[nt][nt]) +{ +#ifdef VERBOSE + printf("> Computing Cholesky Factorization: indirect blocked matrix...\n"); +#endif + + // TODO: Parallel region is required in order to run in parallel. + // TODO: Kernel invocations could be task candidates. + // TODO: Add scheduler restrictions: taskwait, taskgroup, etc. + for (int k = 0; k < nt; k++) { + // Diagonal Block factorization: using LAPACK + LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts); + + // Triangular systems + for (int i = k + 1; i < nt; i++) { + cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts); + } + + // Update trailing matrix + for (int i = k + 1; i < nt; i++) { + for (int j = k + 1; j < i; j++) { + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts); + } + cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts); + } + } + +#ifdef VERBOSE + printf("> ...end of Cholesky Factorization.\n"); +#endif +} + +float get_time() +{ + static double gtod_ref_time_sec = 0.0; + + struct timeval tv; + gettimeofday(&tv, NULL); + + // If this is the first invocation of through dclock(), then initialize the + // "reference time" global variable to the seconds field of the tv struct. + if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec; + + // Normalize the seconds field of the tv struct so that it is relative to the + // "reference time" that was recorded during the first invocation of dclock(). + const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec; + + // Compute the number of seconds since the reference time. + const double t = norm_sec + tv.tv_usec * 1.0e-6; + + return (float) t; +} + +// Robust Check the factorization of the matrix A2 +// Using directly Fortran services: dlacpy_, dtrmm, dlange_ +static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps) +{ +#ifdef VERBOSE + printf("> Checking the Cholesky Factorization... \n"); +#endif + char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R'; + + double *Residual = (double *) malloc(N*N*sizeof(double)); + double *L1 = (double *) malloc(N*N*sizeof(double)); + double *L2 = (double *) malloc(N*N*sizeof(double)); + double *work = (double *) malloc(N*sizeof(double)); + + memset((void*)L1, 0, N*N*sizeof(double)); + memset((void*)L2, 0, N*N*sizeof(double)); + + double alpha= 1.0; + + dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N); + + /* Dealing with L'L or U'U */ + if (uplo == 'U'){ + dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N); + dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N); + dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N); + } else{ + dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N); + dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N); + dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N); + } + + /* Compute the Residual || A -L'L|| */ + for (int i = 0; i < N; i++) + for (int j = 0; j < N; j++) + Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i]; + + double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work); + double Anorm = dlange_(&NORM, &N, &N, A1, &N, work); + +#ifdef VERBOSE + printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps)); +#endif + + const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0); + +#ifdef VERBOSE + if ( info_factorization) printf("> - Factorization is suspicious!\n"); + else printf("> - Factorization is CORRECT!\n"); +#endif + + free(Residual); free(L1); free(L2); free(work); + + return info_factorization; +} + +void initialize_matrix(const int n, const int ts, double *matrix) +{ +#ifdef VERBOSE + printf("> Initializing matrix with random values...\n"); +#endif + + int ISEED[4] = {0,0,0,1}; + int intONE=1; + + for (int i = 0; i < n*n; i+=n) { + dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]); + } + + for (int i=0; i Converting linear matrix to blocks...\n"); +#endif + for (int i = 0; i < DIM; i++) + for (int j = 0; j < DIM; j++) { + gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]); + } +} + +void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N]) +{ +#ifdef VERBOSE + printf("> Converting blocked matrix to linear...\n"); +#endif + for (int i = 0; i < DIM; i++) + for (int j = 0; j < DIM; j++) { + scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]); + } +} + +int main(int argc, char* argv[]) +{ + char *result[3] = {"n/a","pass","FAIL"}; + const double eps = pow(2.0, -53); + + // If number of arguments is not 4, print help + if ( argc != 4) { + printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] ); + exit( -1 ); + } + + const int n = atoi(argv[1]); // matrix size + const int ts = atoi(argv[2]); // tile size + int check = atoi(argv[3]); // check result? + + // Compute number of tiles + const int nt = n / ts; + assert((nt*ts) == n); // tile size should divide size + + // Allocate matrix + double * const matrix = (double *) malloc(n * n * sizeof(double)); + assert(matrix != NULL); + + // Initialize matrix + initialize_matrix(n, ts, matrix); + + // Allocate original matrix, and duplicate it, for debugging purposes + double * const original_matrix = (double *) malloc(n * n * sizeof(double)); + assert(original_matrix != NULL); + + // Save a copy of the original matrix + for (int i = 0; i < n * n; i++ ) { + original_matrix[i] = matrix[i]; + } + + // Set version description: Indirect blocked matrix + const char *version = "I-Blocked matrix"; + + // Allocate blocked matrix + double *Ah[nt][nt]; + for (int i = 0; i < nt; i++) { + for (int j = 0; j < nt; j++) { + Ah[i][j] = malloc(ts * ts * sizeof(double)); + assert(Ah[i][j] != NULL); + } + } + + // --------------------------------------- + // Convert, compute (time), and re-convert + // --------------------------------------- + convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah); + const float tref = get_time(); + cholesky(ts, nt, (double* (*)[nt]) Ah); + const float time = get_time() - tref; + convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix); + + // Free blocked matrix + for (int i = 0; i < nt; i++) { + for (int j = 0; j < nt; j++) { + assert(Ah[i][j] != NULL); + free(Ah[i][j]); + } + } + + // Check result, if requested + if ( check ) { + const char uplo = 'L'; + if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++; + } + + // Free original matrix, not needed anymore + free(original_matrix); + + // Compute GFLOPs (Not verified) + float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9)); + + // Print results +#ifdef VERBOSE + printf( "\n" ); + printf( "============ CHOLESKY RESULTS ============\n" ); + printf( " test %s\n", argv[0]); + printf( " version %s\n", version); + printf( " matrix size: %dx%d\n", n, n); + printf( " tile size: %dx%d\n", ts, ts); + printf( " number of threads: %d\n", omp_get_max_threads()); + printf( " time (s): %f\n", time); + printf( " performance (gflops): %f\n", gflops); + printf( " check: %s\n", result[check]); + printf( "==========================================\n" ); +#else + printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]); +#endif + + // Free matrix + free(matrix); + + return 0; +} + diff --git a/01-tasks/sudoku-tsk/Makefile b/01-tasks/sudoku-tsk/Makefile new file mode 100644 index 0000000..b92c3ec --- /dev/null +++ b/01-tasks/sudoku-tsk/Makefile @@ -0,0 +1,25 @@ +BIN_DIR=../bin +PROGRAM=$(BIN_DIR)/sudoku-tsk + +CC=g++ +CFLAGS=-c -O3 -fopenmp -DUSE_SEQUENTIAL_CUTOFF +LDFLAGS=-fopenmp + +OBJS=SudokuBoard.o sudoku.o + +all: $(PROGRAM) + +$(PROGRAM): $(OBJS) + $(CC) -o $@ $^ $(LDFLAGS) + +%.o: %.cpp + $(CC) $(CFLAGS) -o $@ $^ + +$(BIN_DIR): + mkdir $@ + +clean: + rm -rf $(PROGRAM) *.o + +wipe: clean + rm -rf *.out *.err diff --git a/01-tasks/sudoku-tsk/SudokuBoard.cpp b/01-tasks/sudoku-tsk/SudokuBoard.cpp new file mode 100644 index 0000000..139f4fd --- /dev/null +++ b/01-tasks/sudoku-tsk/SudokuBoard.cpp @@ -0,0 +1,119 @@ +/* + * This file is part of Christian's OpenMP Task-parallel Sudoku Solver + * + * Copyright (C) 2013 by Christian Terboven + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "SudokuBoard.h" + +#include + +CSudokuBoard::CSudokuBoard(int fsize, int bsize) + : field_size(fsize), block_size(bsize), solutions(-1) +{ + field = new int[field_size*field_size]; +} + + +CSudokuBoard::CSudokuBoard(const CSudokuBoard& other) + : field_size(other.getFieldSize()), block_size(other.getBlockSize()), solutions(other.getNumSolutions()) +{ + field = new int[field_size*field_size]; + std::memcpy(field, other.field, sizeof(int) * field_size*field_size); +} + + +CSudokuBoard::~CSudokuBoard(void) +{ + delete[] field; +} + + +bool CSudokuBoard::loadFromFile(char *filename) +{ + std::ifstream ifile(filename); + + if (!ifile) { + std::cout << "There was an error opening the input file " << filename << std::endl; + std::cout << std::endl; + return false; + } + + for (int i = 0; i < this->field_size; i++) { + for (int j = 0; j < this->field_size; j++) { + ifile >> this->field[ACCESS(i,j)]; + } + } + + return true; +} + + +void CSudokuBoard::printBoard() +{ + for(int i = 0; i < field_size; i++) { + for(int j = 0; j < field_size; j++) { + std::cout << std::setw(3) << + this->field[ACCESS(i,j)] + << " "; + } + std::cout << std::endl; + } +} + + +bool CSudokuBoard::check(int x, int y, int value) +{ + if(checkHorizontal(y, value)) + return true; + if(checkVertical(x, value)) + return true; + if(checkBox(x, y, value)) + return true; + return false; +} + + +bool CSudokuBoard::checkHorizontal(int y, int value) { + for(int i = 0; i < field_size; i++) + if(field[ACCESS(y,i)] == value) + return true; + return false; +} + + +bool CSudokuBoard::checkVertical(int x, int value) { + for(int i = 0; i < field_size; i++) + if(field[ACCESS(i,x)] == value) + return true; + return false; +} + + +bool CSudokuBoard::checkBox(int x, int y, int value) { + // find suitable box edge + int x_box = (int)(x / block_size) * block_size; + int y_box = (int)(y / block_size) * block_size; + + for(int i = y_box; i < y_box + block_size; i++) + for(int j = x_box; j < x_box + block_size; j++) + if(field[ACCESS(i,j)] == value) + return true; + + return false; +} diff --git a/01-tasks/sudoku-tsk/SudokuBoard.h b/01-tasks/sudoku-tsk/SudokuBoard.h new file mode 100644 index 0000000..f6c9c74 --- /dev/null +++ b/01-tasks/sudoku-tsk/SudokuBoard.h @@ -0,0 +1,96 @@ +/* + * This file is part of Christian's OpenMP Task-parallel Sudoku Solver + * + * Copyright (C) 2013 by Christian Terboven + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include +#include +#include + +#define ACCESS(x, y) (this->field_size*(x) + (y)) + +/** + * @brief Representation of the Sudoku board, includes solver and utility functions + */ +class CSudokuBoard +{ +public: + CSudokuBoard(int fsize, int bsize); + CSudokuBoard(const CSudokuBoard& other); + ~CSudokuBoard(void); + + inline int getNumSolutions() const { + return this->solutions; + } + + inline void incrementSolutionCounter() { + if (this->solutions == -1) + this->solutions = 1; + else + this->solutions++; + } + + inline int getFieldSize() const { + return this->field_size; + } + + inline int getBlockSize() const { + return this->block_size; + } + + inline int get(int x, int y) const { + return this->field[ACCESS(x,y)]; + } + + inline void set(int x, int y, int value) { + this->field[ACCESS(x, y)] = value; + } + + /** + * Read Sudoku template from file + * @param filename name of file to read input from + * @return true if reading file was successful, false if not + */ + bool loadFromFile(char *filename); + + /** + * Print the Sudoku board to stdout + */ + void printBoard(); + + /** + * Check whether value already used + * @return true if already used, false if not + */ + bool check(int x, int y, int value); + + +private: + bool checkHorizontal(int y, int value); + bool checkVertical(int x, int value); + bool checkBox(int x, int y, int value); + + int field_size; + int block_size; + + int *field; + + int solutions; +}; diff --git a/01-tasks/sudoku-tsk/sudoku-solved.cpp b/01-tasks/sudoku-tsk/sudoku-solved.cpp new file mode 100644 index 0000000..55c2d3a --- /dev/null +++ b/01-tasks/sudoku-tsk/sudoku-solved.cpp @@ -0,0 +1,177 @@ +/* + * This file is part of Christian's OpenMP Task-parallel Sudoku Solver + * + * Copyright (C) 2013 by Christian Terboven + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include + +#include +#include + +#include + +#include "SudokuBoard.h" + + +int found_sudokus = 0; + + +/** + * Solve the Sudoku puzzle by finding and counting all solutions (if all are requested) + */ +bool solve(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true, bool findAllSolutions = true) +{ + if(x == sudoku->getFieldSize()) { // end of line + y++; + x = 0; + if(y == sudoku->getFieldSize()) // end + return true; + } + + if(sudoku->get(y, x) > 0) // field already set + return solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions); // tackle next field + + for(int i = 1; i <= sudoku->getFieldSize(); i++) { // try all numbers + if(!sudoku->check(x, y, i)) { + sudoku->set(y, x, i); // if number fits, set it + if(solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions)) { // tackle next field +#pragma omp atomic + found_sudokus++; + sudoku->incrementSolutionCounter(); // solution found :-) + if (printAnyFoundSolution) { + std::cout << "The following is a valid solution:" << std::endl; + sudoku->printBoard(); + std::cout << std::endl; + } + if (!findAllSolutions) { + return true; // return, as only one solution was asked for + } + } + } + } + + sudoku->set(y, x, 0); // no solution found, reset field + return false; +} + + + +/** + * Solve the Sudoku puzzle in parallel by finding and counting all solutions + */ +bool solve_parallel(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true) +{ + if (x == sudoku->getFieldSize()) { // end of line + y++; + x = 0; + if(y == sudoku->getFieldSize()) // end + return true; + } + + if (sudoku->get(y, x) > 0) { // field already set + return solve_parallel(x+1, y, sudoku, printAnyFoundSolution); // tackle next field + } + +#if USE_SEQUENTIAL_CUTOFF + if ( y > 1 ) { + return solve(x, y, sudoku, printAnyFoundSolution); + } +#endif + + for (int i = 1; i <= sudoku->getFieldSize(); i++) { // try all numbers + if (!sudoku->check(x, y, i)) { +#pragma omp task firstprivate(i,x,y,sudoku) + { + CSudokuBoard new_sudoku(*sudoku); + new_sudoku.set(y, x, i); // if number fits, set it + + if (solve_parallel(x+1, y, &new_sudoku, printAnyFoundSolution)) { // tackle next field +#pragma omp atomic + found_sudokus++; + // sudoku->incrementSolutionCounter(); // solution found :-) + if (printAnyFoundSolution) { + std::cout << "The following is a valid solution:" << std::endl; + new_sudoku.printBoard(); + std::cout << std::endl; + } + } + } + } + } + +#pragma omp taskwait + sudoku->set(y, x, 0); // no solution found, reset field + return false; +} + + + +/** @brief program entry point + */ +int main(int argc, char* argv[]) { + // measure the time + double t3, t4; + int nthreads = 0; + + // expect three command line arguments: field size, block size, and input file + if (argc != 4) { + std::cout << "Usage: sudoku.exe " << std::endl; + std::cout << std::endl; + return -1; + } + else { + CSudokuBoard *sudoku1 = new CSudokuBoard(atoi(argv[1]), atoi(argv[2])); + if (!sudoku1->loadFromFile(argv[3])) { + std::cout << "There was an error reading a Sudoku template from " << argv[3] << std::endl; + std::cout << std::endl; + return -1; + } + + // print the Sudoku board template + std::cout << "Given Sudoku template" << std::endl; + sudoku1->printBoard(); + + // solve the Sudoku by finding (and printing) all solutions + t3 = omp_get_wtime(); + #pragma omp parallel sections + { + solve_parallel(0, 0, sudoku1, false); + } + t4 = omp_get_wtime(); + +#pragma omp parallel + { +#pragma omp master + nthreads = omp_get_num_threads(); + } + + std::cout << std::endl; + std::cout << "In total there were " << sudoku1->getNumSolutions() << " solutions found in serial." << std::endl; + std::cout << "In total there were " << found_sudokus << " solutions found in parallel " << std::endl; + std::cout << std::endl; + + delete sudoku1; + } + + // print the time + std::cout << "Parallel computation took " << t4 - t3 << " seconds (" + << nthreads << " threads)." << std::endl << std::endl; + + return 0; +} diff --git a/01-tasks/sudoku-tsk/sudoku.cpp b/01-tasks/sudoku-tsk/sudoku.cpp new file mode 100644 index 0000000..bbce98a --- /dev/null +++ b/01-tasks/sudoku-tsk/sudoku.cpp @@ -0,0 +1,169 @@ +/* + * This file is part of Christian's OpenMP Task-parallel Sudoku Solver + * + * Copyright (C) 2013 by Christian Terboven + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include + +#include +#include + +#include + +#include "SudokuBoard.h" + + +int found_sudokus = 0; + + +/** + * Solve the Sudoku puzzle by finding and counting all solutions (if all are requested) + */ +bool solve(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true, bool findAllSolutions = true) +{ + if(x == sudoku->getFieldSize()) { // end of line + y++; + x = 0; + if(y == sudoku->getFieldSize()) // end + return true; + } + + if(sudoku->get(y, x) > 0) // field already set + return solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions); // tackle next field + + for(int i = 1; i <= sudoku->getFieldSize(); i++) { // try all numbers + if(!sudoku->check(x, y, i)) { + sudoku->set(y, x, i); // if number fits, set it + if(solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions)) { // tackle next field +#pragma omp atomic + found_sudokus++; + sudoku->incrementSolutionCounter(); // solution found :-) + if (printAnyFoundSolution) { + std::cout << "The following is a valid solution:" << std::endl; + sudoku->printBoard(); + std::cout << std::endl; + } + if (!findAllSolutions) { + return true; // return, as only one solution was asked for + } + } + } + } + + sudoku->set(y, x, 0); // no solution found, reset field + return false; +} + + + +/** + * Solve the Sudoku puzzle in parallel by finding and counting all solutions + */ +bool solve_parallel(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true) +{ + if (x == sudoku->getFieldSize()) { // end of line + y++; + x = 0; + if(y == sudoku->getFieldSize()) // end + return true; + } + + if (sudoku->get(y, x) > 0) { // field already set + return solve_parallel(x+1, y, sudoku, printAnyFoundSolution); // tackle next field + } + +#if USE_SEQUENTIAL_CUTOFF + if ( y > 1 ) { + return solve(x, y, sudoku, printAnyFoundSolution); + } +#endif + +// TODO: Task candidates (for each possible value?) + for (int i = 1; i <= sudoku->getFieldSize(); i++) { // try all numbers + if (!sudoku->check(x, y, i)) { + { + CSudokuBoard new_sudoku(*sudoku); + new_sudoku.set(y, x, i); // if number fits, set it + + if (solve_parallel(x+1, y, &new_sudoku, printAnyFoundSolution)) { // tackle next field +#pragma omp atomic + found_sudokus++; + // sudoku->incrementSolutionCounter(); // solution found :-) + if (printAnyFoundSolution) { + std::cout << "The following is a valid solution:" << std::endl; + new_sudoku.printBoard(); + std::cout << std::endl; + } + } + } + } + } + +// TODO: After creating tasks for this level we will need to synchronize + sudoku->set(y, x, 0); // no solution found, reset field + return false; +} + + + +/** @brief program entry point + */ +int main(int argc, char* argv[]) { + // measure the time + double t3, t4; + + // expect three command line arguments: field size, block size, and input file + if (argc != 4) { + std::cout << "Usage: sudoku.exe " << std::endl; + std::cout << std::endl; + return -1; + } + else { + CSudokuBoard *sudoku1 = new CSudokuBoard(atoi(argv[1]), atoi(argv[2])); + if (!sudoku1->loadFromFile(argv[3])) { + std::cout << "There was an error reading a Sudoku template from " << argv[3] << std::endl; + std::cout << std::endl; + return -1; + } + + // print the Sudoku board template + std::cout << "Given Sudoku template" << std::endl; + sudoku1->printBoard(); + + // solve the Sudoku by finding (and printing) all solutions + t3 = omp_get_wtime(); + // TODO: Sudoku solver needs to execute within a parallel region + { + solve_parallel(0, 0, sudoku1, false); + } + t4 = omp_get_wtime(); + + std::cout << std::endl; + std::cout << "In total there were " << found_sudokus << " solutions found in parallel " << std::endl; + std::cout << std::endl; + + delete sudoku1; + } + + // print the time + std::cout << "Parallel computation took " << t4 - t3 << " seconds (" + << omp_get_max_threads() << " threads)." << std::endl << std::endl; + + return 0; +} diff --git a/02-dependencies/Makefile b/02-dependencies/Makefile new file mode 100644 index 0000000..9d1cee5 --- /dev/null +++ b/02-dependencies/Makefile @@ -0,0 +1,14 @@ +ALL_RECURSIVE=all-recursive clean-recursive + +DIRS=$(shell ls -d *-* | sed '/_/d' 2>/dev/null) + +all: $(CONFIG) all-recursive +clean: $(CONFIG) clean-recursive + +$(ALL_RECURSIVE): + @failcom='exit 1';\ + target=`echo $@ | sed s/-recursive//`; \ + for subdir in $(DIRS); do \ + (cd $$subdir && $(MAKE) $$target) || eval $$failcom; \ + done; + diff --git a/02-dependencies/bin/.gitdir b/02-dependencies/bin/.gitdir new file mode 100644 index 0000000..e69de29 diff --git a/02-dependencies/bin/run-cholesky.sh b/02-dependencies/bin/run-cholesky.sh new file mode 100755 index 0000000..22fdc2f --- /dev/null +++ b/02-dependencies/bin/run-cholesky.sh @@ -0,0 +1,29 @@ +#!/bin/bash +#SBATCH --job-name=cholesky +#SBATCH --workdir=. +#SBATCH --output=cholesky_%j.out +#SBATCH --error=cholesky_%j.err +#SBATCH --cpus-per-task=48 +#SBATCH --ntasks=1 +#SBATCH --time=00:30:00 +#SBATCH --qos=debug + +export IFS=";" + +THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48" +#THREADS="01;02;04;06;08;10;12" + +#MSIZES="2048;4096;8192" +MSIZES="8192" +MSIZES="16384" +#MSIZES="2048" + +BSIZES="256" + +for MS in $MSIZES; do + for BS in $BSIZES; do + for threads in $THREADS; do + OMP_NUM_THREADS=$threads ./cholesky-tsk-dep $MS $BS 0 + done + done +done diff --git a/02-dependencies/cholesky-tsk-dep/Makefile b/02-dependencies/cholesky-tsk-dep/Makefile new file mode 100644 index 0000000..107fe8f --- /dev/null +++ b/02-dependencies/cholesky-tsk-dep/Makefile @@ -0,0 +1,23 @@ +BIN_DIR=../bin +PROGRAM=$(BIN_DIR)/cholesky-tsk-dep + +MKL_HOME=$(MKLROOT) + +CC=gcc +CFLAGS=-O3 -std=gnu99 -I$(MKL_HOME)/include -fopenmp +LDFLAGS=-L$(MKL_HOME)/lib/intel64 -lmkl_sequential -lmkl_core -lmkl_rt -lpthread -lm -fopenmp +#VERBOSE=-DVERBOSE + +all: $(PROGRAM) + +$(PROGRAM): cholesky.c + $(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS) + +$(BIN_DIR): + mkdir $@ + +clean: + rm -rf $(PROGRAM) *.o + +wipe: clean + rm -rf *.out *.err diff --git a/02-dependencies/cholesky-tsk-dep/cholesky-solved.c b/02-dependencies/cholesky-tsk-dep/cholesky-solved.c new file mode 100644 index 0000000..4807e35 --- /dev/null +++ b/02-dependencies/cholesky-tsk-dep/cholesky-solved.c @@ -0,0 +1,296 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "omp.h" + +#if !defined(_OPENMP) + int omp_get_max_threads() { return 1; } + int omp_get_num_threads() { return 1; } +#endif + +void cholesky(int ts, int nt, double* Ah[nt][nt]) +{ +#ifdef VERBOSE + printf("> Computing Cholesky Factorization: indirect blocked matrix...\n"); +#endif + + #pragma omp parallel + #pragma omp single + for (int k = 0; k < nt; k++) { + // Diagonal Block factorization: using LAPACK + #pragma omp task depend(inout: Ah[k][k]) + LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts); + + // Triangular systems + for (int i = k + 1; i < nt; i++) { + #pragma omp task depend(in: Ah[k][k]) depend(inout: Ah[k][i]) + cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts); + } + + // Update trailing matrix + for (int i = k + 1; i < nt; i++) { + for (int j = k + 1; j < i; j++) { + #pragma omp task depend(in: Ah[k][i], Ah[k][j]) depend(inout: Ah[j][i]) + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts); + } + #pragma omp task depend(in: Ah[k][i]) depend(inout: Ah[i][i]) + cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts); + } + } + + #pragma omp taskwait +#ifdef VERBOSE + printf("> ...end of Cholesky Factorization.\n"); +#endif +} + +float get_time() +{ + static double gtod_ref_time_sec = 0.0; + + struct timeval tv; + gettimeofday(&tv, NULL); + + // If this is the first invocation of through dclock(), then initialize the + // "reference time" global variable to the seconds field of the tv struct. + if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec; + + // Normalize the seconds field of the tv struct so that it is relative to the + // "reference time" that was recorded during the first invocation of dclock(). + const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec; + + // Compute the number of seconds since the reference time. + const double t = norm_sec + tv.tv_usec * 1.0e-6; + + return (float) t; +} + +// Robust Check the factorization of the matrix A2 +// Using directly Fortran services: dlacpy_, dtrmm, dlange_ +static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps) +{ +#ifdef VERBOSE + printf("> Checking the Cholesky Factorization... \n"); +#endif + char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R'; + + double *Residual = (double *) malloc(N*N*sizeof(double)); + double *L1 = (double *) malloc(N*N*sizeof(double)); + double *L2 = (double *) malloc(N*N*sizeof(double)); + double *work = (double *) malloc(N*sizeof(double)); + + memset((void*)L1, 0, N*N*sizeof(double)); + memset((void*)L2, 0, N*N*sizeof(double)); + + double alpha= 1.0; + + dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N); + + /* Dealing with L'L or U'U */ + if (uplo == 'U'){ + dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N); + dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N); + dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N); + } else{ + dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N); + dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N); + dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N); + } + + /* Compute the Residual || A -L'L|| */ + for (int i = 0; i < N; i++) + for (int j = 0; j < N; j++) + Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i]; + + double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work); + double Anorm = dlange_(&NORM, &N, &N, A1, &N, work); + +#ifdef VERBOSE + printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps)); +#endif + + const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0); + +#ifdef VERBOSE + if ( info_factorization) printf("> - Factorization is suspicious!\n"); + else printf("> - Factorization is CORRECT!\n"); +#endif + + free(Residual); free(L1); free(L2); free(work); + + return info_factorization; +} + +void initialize_matrix(const int n, const int ts, double *matrix) +{ +#ifdef VERBOSE + printf("> Initializing matrix with random values...\n"); +#endif + + int ISEED[4] = {0,0,0,1}; + int intONE=1; + + for (int i = 0; i < n*n; i+=n) { + dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]); + } + + for (int i=0; i Converting linear matrix to blocks...\n"); +#endif + for (int i = 0; i < DIM; i++) + for (int j = 0; j < DIM; j++) { + gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]); + } +} + +void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N]) +{ +#ifdef VERBOSE + printf("> Converting blocked matrix to linear...\n"); +#endif + for (int i = 0; i < DIM; i++) + for (int j = 0; j < DIM; j++) { + scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]); + } +} + +int main(int argc, char* argv[]) +{ + char *result[3] = {"n/a","pass","FAIL"}; + const double eps = pow(2.0, -53); + + // If number of arguments is not 4, print help + if ( argc != 4) { + printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] ); + exit( -1 ); + } + + const int n = atoi(argv[1]); // matrix size + const int ts = atoi(argv[2]); // tile size + int check = atoi(argv[3]); // check result? + + // Compute number of tiles + const int nt = n / ts; + assert((nt*ts) == n); // tile size should divide size + + // Allocate matrix + double * const matrix = (double *) malloc(n * n * sizeof(double)); + assert(matrix != NULL); + + // Initialize matrix + initialize_matrix(n, ts, matrix); + + // Allocate original matrix, and duplicate it, for debugging purposes + double * const original_matrix = (double *) malloc(n * n * sizeof(double)); + assert(original_matrix != NULL); + + // Save a copy of the original matrix + for (int i = 0; i < n * n; i++ ) { + original_matrix[i] = matrix[i]; + } + + // Set version description: Indirect blocked matrix + const char *version = "I-Blocked matrix"; + + // Allocate blocked matrix + double *Ah[nt][nt]; + for (int i = 0; i < nt; i++) { + for (int j = 0; j < nt; j++) { + Ah[i][j] = malloc(ts * ts * sizeof(double)); + assert(Ah[i][j] != NULL); + } + } + + // --------------------------------------- + // Convert, compute (time), and re-convert + // --------------------------------------- + convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah); + const float tref = get_time(); + cholesky(ts, nt, (double* (*)[nt]) Ah); + const float time = get_time() - tref; + convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix); + + // Free blocked matrix + for (int i = 0; i < nt; i++) { + for (int j = 0; j < nt; j++) { + assert(Ah[i][j] != NULL); + free(Ah[i][j]); + } + } + + // Check result, if requested + if ( check ) { + const char uplo = 'L'; + if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++; + } + + // Free original matrix, not needed anymore + free(original_matrix); + + // Compute GFLOPs (Not verified) + float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9)); + + // Print results +#ifdef VERBOSE + printf( "\n" ); + printf( "============ CHOLESKY RESULTS ============\n" ); + printf( " test %s\n", argv[0]); + printf( " version %s\n", version); + printf( " matrix size: %dx%d\n", n, n); + printf( " tile size: %dx%d\n", ts, ts); + printf( " number of threads: %d\n", omp_get_max_threads()); + printf( " time (s): %f\n", time); + printf( " performance (gflops): %f\n", gflops); + printf( " check: %s\n", result[check]); + printf( "==========================================\n" ); +#else + printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]); +#endif + + // Free matrix + free(matrix); + + return 0; +} + diff --git a/02-dependencies/cholesky-tsk-dep/cholesky.c b/02-dependencies/cholesky-tsk-dep/cholesky.c new file mode 100644 index 0000000..521e978 --- /dev/null +++ b/02-dependencies/cholesky-tsk-dep/cholesky.c @@ -0,0 +1,293 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "omp.h" + +#if !defined(_OPENMP) + int omp_get_max_threads() { return 1; } + int omp_get_num_threads() { return 1; } +#endif + +void cholesky(int ts, int nt, double* Ah[nt][nt]) +{ +#ifdef VERBOSE + printf("> Computing Cholesky Factorization: indirect blocked matrix...\n"); +#endif + + // TODO: Parallel region is required in order to run in parallel. + // TODO: Kernel invocations could be task candidates. + // TODO: Add scheduler restrictions between tasks using dependences. + for (int k = 0; k < nt; k++) { + // Diagonal Block factorization: using LAPACK + LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts); + + // Triangular systems + for (int i = k + 1; i < nt; i++) { + cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts); + } + + // Update trailing matrix + for (int i = k + 1; i < nt; i++) { + for (int j = k + 1; j < i; j++) { + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts); + } + cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts); + } + } + +// TODO: we need to guarantee all the work has finished at this point +#ifdef VERBOSE + printf("> ...end of Cholesky Factorization.\n"); +#endif +} + +float get_time() +{ + static double gtod_ref_time_sec = 0.0; + + struct timeval tv; + gettimeofday(&tv, NULL); + + // If this is the first invocation of through dclock(), then initialize the + // "reference time" global variable to the seconds field of the tv struct. + if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec; + + // Normalize the seconds field of the tv struct so that it is relative to the + // "reference time" that was recorded during the first invocation of dclock(). + const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec; + + // Compute the number of seconds since the reference time. + const double t = norm_sec + tv.tv_usec * 1.0e-6; + + return (float) t; +} + +// Robust Check the factorization of the matrix A2 +// Using directly Fortran services: dlacpy_, dtrmm, dlange_ +static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps) +{ +#ifdef VERBOSE + printf("> Checking the Cholesky Factorization... \n"); +#endif + char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R'; + + double *Residual = (double *) malloc(N*N*sizeof(double)); + double *L1 = (double *) malloc(N*N*sizeof(double)); + double *L2 = (double *) malloc(N*N*sizeof(double)); + double *work = (double *) malloc(N*sizeof(double)); + + memset((void*)L1, 0, N*N*sizeof(double)); + memset((void*)L2, 0, N*N*sizeof(double)); + + double alpha= 1.0; + + dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N); + + /* Dealing with L'L or U'U */ + if (uplo == 'U'){ + dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N); + dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N); + dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N); + } else{ + dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N); + dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N); + dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N); + } + + /* Compute the Residual || A -L'L|| */ + for (int i = 0; i < N; i++) + for (int j = 0; j < N; j++) + Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i]; + + double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work); + double Anorm = dlange_(&NORM, &N, &N, A1, &N, work); + +#ifdef VERBOSE + printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps)); +#endif + + const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0); + +#ifdef VERBOSE + if ( info_factorization) printf("> - Factorization is suspicious!\n"); + else printf("> - Factorization is CORRECT!\n"); +#endif + + free(Residual); free(L1); free(L2); free(work); + + return info_factorization; +} + +void initialize_matrix(const int n, const int ts, double *matrix) +{ +#ifdef VERBOSE + printf("> Initializing matrix with random values...\n"); +#endif + + int ISEED[4] = {0,0,0,1}; + int intONE=1; + + for (int i = 0; i < n*n; i+=n) { + dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]); + } + + for (int i=0; i Converting linear matrix to blocks...\n"); +#endif + for (int i = 0; i < DIM; i++) + for (int j = 0; j < DIM; j++) { + gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]); + } +} + +void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N]) +{ +#ifdef VERBOSE + printf("> Converting blocked matrix to linear...\n"); +#endif + for (int i = 0; i < DIM; i++) + for (int j = 0; j < DIM; j++) { + scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]); + } +} + +int main(int argc, char* argv[]) +{ + char *result[3] = {"n/a","pass","FAIL"}; + const double eps = pow(2.0, -53); + + // If number of arguments is not 4, print help + if ( argc != 4) { + printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] ); + exit( -1 ); + } + + const int n = atoi(argv[1]); // matrix size + const int ts = atoi(argv[2]); // tile size + int check = atoi(argv[3]); // check result? + + // Compute number of tiles + const int nt = n / ts; + assert((nt*ts) == n); // tile size should divide size + + // Allocate matrix + double * const matrix = (double *) malloc(n * n * sizeof(double)); + assert(matrix != NULL); + + // Initialize matrix + initialize_matrix(n, ts, matrix); + + // Allocate original matrix, and duplicate it, for debugging purposes + double * const original_matrix = (double *) malloc(n * n * sizeof(double)); + assert(original_matrix != NULL); + + // Save a copy of the original matrix + for (int i = 0; i < n * n; i++ ) { + original_matrix[i] = matrix[i]; + } + + // Set version description: Indirect blocked matrix + const char *version = "I-Blocked matrix"; + + // Allocate blocked matrix + double *Ah[nt][nt]; + for (int i = 0; i < nt; i++) { + for (int j = 0; j < nt; j++) { + Ah[i][j] = malloc(ts * ts * sizeof(double)); + assert(Ah[i][j] != NULL); + } + } + + // --------------------------------------- + // Convert, compute (time), and re-convert + // --------------------------------------- + convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah); + const float tref = get_time(); + cholesky(ts, nt, (double* (*)[nt]) Ah); + const float time = get_time() - tref; + convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix); + + // Free blocked matrix + for (int i = 0; i < nt; i++) { + for (int j = 0; j < nt; j++) { + assert(Ah[i][j] != NULL); + free(Ah[i][j]); + } + } + + // Check result, if requested + if ( check ) { + const char uplo = 'L'; + if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++; + } + + // Free original matrix, not needed anymore + free(original_matrix); + + // Compute GFLOPs (Not verified) + float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9)); + + // Print results +#ifdef VERBOSE + printf( "\n" ); + printf( "============ CHOLESKY RESULTS ============\n" ); + printf( " test %s\n", argv[0]); + printf( " version %s\n", version); + printf( " matrix size: %dx%d\n", n, n); + printf( " tile size: %dx%d\n", ts, ts); + printf( " number of threads: %d\n", omp_get_max_threads()); + printf( " time (s): %f\n", time); + printf( " performance (gflops): %f\n", gflops); + printf( " check: %s\n", result[check]); + printf( "==========================================\n" ); +#else + printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]); +#endif + + // Free matrix + free(matrix); + + return 0; +} + diff --git a/03-cut-offs/Makefile b/03-cut-offs/Makefile new file mode 100644 index 0000000..7a67e63 --- /dev/null +++ b/03-cut-offs/Makefile @@ -0,0 +1,14 @@ +ALL_RECURSIVE=all-recursive clean-recursive + +DIRS=$(shell ls -d */ | sed '/_/d' 2>/dev/null | sed '/bin/d' 2>/dev/null) + +all: all-recursive +clean: clean-recursive + +$(ALL_RECURSIVE): + @failcom='exit 1';\ + target=`echo $@ | sed s/-recursive//`; \ + for subdir in $(DIRS); do \ + (cd $$subdir && $(MAKE) $$target) || eval $$failcom; \ + done; + diff --git a/03-cut-offs/bin/.gitdir b/03-cut-offs/bin/.gitdir new file mode 100644 index 0000000..e69de29 diff --git a/03-cut-offs/bin/run-mergesort.sh b/03-cut-offs/bin/run-mergesort.sh new file mode 100755 index 0000000..748cb3f --- /dev/null +++ b/03-cut-offs/bin/run-mergesort.sh @@ -0,0 +1,24 @@ +#!/bin/bash +#SBATCH --job-name=mergesort +#SBATCH --workdir=. +#SBATCH --output=mergesort_%j.out +#SBATCH --error=mergesort_%j.err +#SBATCH --cpus-per-task=48 +#SBATCH --ntasks=1 +#SBATCH --time=00:30:00 +#SBATCH --qos=debug + +export IFS=";" + +THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48" +#THREADS="01;02;04;06;08;10;12" + +ASIZES="1000000" + +for AS in $ASIZES; do + for threads in $THREADS; do + echo Running mergesort-co with $threads + OMP_NUM_THREADS=$threads ./mergesort-co $AS + echo -- + done +done diff --git a/03-cut-offs/bin/run-sudoku.sh b/03-cut-offs/bin/run-sudoku.sh new file mode 100755 index 0000000..618ef9b --- /dev/null +++ b/03-cut-offs/bin/run-sudoku.sh @@ -0,0 +1,20 @@ +#!/bin/bash +#SBATCH --job-name=sudoku +#SBATCH --workdir=. +#SBATCH --output=sudoku_%j.out +#SBATCH --error=sudoku_%j.err +#SBATCH --cpus-per-task=48 +#SBATCH --ntasks=1 +#SBATCH --time=00:30:00 +#SBATCH --qos=debug + +export IFS=";" + +THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48" +#THREADS="01;02;04;06;08;10;12" + +for threads in $THREADS; do + #OMP_NUM_THREADS=$threads ./sudoku-co 9 3 sudoku-9x9-1.txt + OMP_NUM_THREADS=$threads ./sudoku-co 16 4 sudoku-16x16-1.txt + #OMP_NUM_THREADS=$threads ./sudoku-co 25 5 sudoku-25x25-1.txt +done diff --git a/03-cut-offs/bin/sudoku-16x16-1.txt b/03-cut-offs/bin/sudoku-16x16-1.txt new file mode 100644 index 0000000..b3eb632 --- /dev/null +++ b/03-cut-offs/bin/sudoku-16x16-1.txt @@ -0,0 +1,16 @@ +0 6 0 0 0 0 0 8 11 0 0 15 14 0 0 16 +15 11 0 0 0 16 14 0 0 0 12 0 0 6 0 0 +13 0 9 12 0 0 0 0 3 16 14 0 15 11 10 0 +2 0 16 0 11 0 15 10 1 0 0 0 0 0 0 0 +0 15 11 10 0 0 16 2 13 8 9 12 0 0 0 0 +12 13 0 0 4 1 5 6 2 3 0 0 0 0 11 10 +5 0 6 1 12 0 9 0 15 11 10 7 16 0 0 3 +0 2 0 0 0 10 0 11 6 0 5 0 0 13 0 9 +10 7 15 11 16 0 0 0 12 13 0 0 0 0 0 6 +9 0 0 0 0 0 1 0 0 2 0 16 10 0 0 11 +1 0 4 6 9 13 0 0 7 0 11 0 3 16 0 0 +16 14 0 0 7 0 10 15 4 6 1 0 0 0 13 8 +11 10 0 15 0 0 0 16 9 12 13 0 0 1 5 4 +0 0 12 0 1 4 6 0 16 0 0 0 11 10 0 0 +0 0 5 0 8 12 13 0 10 0 0 11 2 0 0 14 +3 16 0 0 10 0 0 7 0 0 6 0 0 0 12 0 diff --git a/03-cut-offs/bin/sudoku-25x25-1.txt b/03-cut-offs/bin/sudoku-25x25-1.txt new file mode 100644 index 0000000..dc1d28e --- /dev/null +++ b/03-cut-offs/bin/sudoku-25x25-1.txt @@ -0,0 +1,25 @@ +0 8 0 18 0 12 20 21 16 0 0 0 0 0 0 15 0 0 24 5 7 0 19 0 0 +0 0 15 9 5 4 0 23 0 18 12 20 0 21 16 7 0 0 0 2 1 0 0 0 0 +0 0 0 0 14 7 0 0 0 0 0 24 0 0 0 0 10 0 0 16 0 0 23 18 6 +21 20 12 0 16 0 0 25 14 22 7 0 0 19 2 4 0 23 8 6 0 24 0 0 0 +19 0 7 0 2 15 0 0 0 9 0 0 18 0 0 0 0 0 0 0 12 0 21 0 16 +0 23 6 0 0 16 0 0 0 20 14 25 11 0 0 5 24 7 3 9 2 19 1 0 17 +4 21 0 20 0 0 0 12 22 11 2 19 0 0 0 0 0 15 23 0 0 3 7 0 9 +12 25 14 0 22 0 19 0 0 13 0 0 24 7 0 0 0 0 21 10 6 0 15 8 18 +7 0 0 24 9 6 0 15 18 0 0 0 0 0 10 0 13 1 19 0 0 25 0 11 22 +0 19 2 13 17 5 0 0 9 0 6 0 8 15 0 14 11 12 25 22 16 0 0 0 10 +0 0 0 1 0 0 0 0 3 7 8 0 15 0 0 11 12 10 0 0 0 0 18 4 0 +10 16 11 12 25 0 14 22 19 1 0 0 0 17 3 20 0 0 6 21 8 0 0 0 0 +0 0 8 0 23 20 0 18 21 4 11 16 0 0 25 0 7 0 2 0 0 14 22 0 0 +0 0 0 0 0 0 0 0 0 0 0 0 0 22 19 8 15 0 5 23 24 0 17 0 3 +0 2 0 7 3 8 0 9 23 15 20 0 4 0 21 13 1 0 14 19 0 0 10 12 25 +2 0 0 0 0 18 15 5 0 23 0 0 21 6 20 17 0 14 1 13 0 12 0 25 11 +0 1 0 19 13 9 7 0 24 0 0 0 23 0 8 0 25 0 0 11 0 0 0 21 0 +0 0 10 0 0 0 0 0 0 0 17 1 0 14 0 18 0 5 15 8 9 0 0 0 0 +16 12 0 0 11 0 0 14 0 0 0 0 3 2 24 0 21 0 4 20 0 15 0 0 8 +0 0 0 0 0 10 0 0 0 0 0 12 0 0 11 9 3 0 0 0 17 0 14 19 13 +24 0 0 0 15 21 18 0 4 0 0 0 16 20 0 3 2 13 0 7 19 22 0 0 1 +0 18 0 0 0 0 10 0 0 16 0 22 0 0 0 23 0 24 9 0 3 17 0 0 0 +0 0 0 0 0 0 0 0 15 5 0 0 6 8 4 0 14 11 22 1 25 0 20 16 12 +0 22 19 14 1 3 0 0 0 0 23 9 5 0 15 25 0 20 10 12 21 0 8 0 4 +20 0 25 16 0 0 22 11 1 0 3 0 2 13 7 21 0 0 0 0 0 9 0 0 0 diff --git a/03-cut-offs/bin/sudoku-9x9-1.txt b/03-cut-offs/bin/sudoku-9x9-1.txt new file mode 100644 index 0000000..3101a00 --- /dev/null +++ b/03-cut-offs/bin/sudoku-9x9-1.txt @@ -0,0 +1,10 @@ +5 3 0 0 7 0 0 0 0 +6 0 0 1 9 5 3 0 0 +0 9 8 0 0 0 0 6 0 +8 0 0 0 6 0 0 0 3 +4 0 0 8 0 3 0 0 1 +7 0 0 0 2 0 0 0 6 +0 6 0 0 0 0 2 8 0 +0 0 0 4 1 9 0 0 5 +0 0 0 0 8 0 0 7 9 + diff --git a/03-cut-offs/mergesort-co/Makefile b/03-cut-offs/mergesort-co/Makefile new file mode 100644 index 0000000..84ee1cf --- /dev/null +++ b/03-cut-offs/mergesort-co/Makefile @@ -0,0 +1,20 @@ +BIN_DIR=../bin +PROGRAM=$(BIN_DIR)/mergesort-co + +CC=g++ +CFLAGS=-O3 -fopenmp +LDFLAGS=-fopenmp + +all: $(PROGRAM) + +$(PROGRAM): mergesort.cpp + $(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS) + +$(BIN_DIR): + mkdir $@ + +clean: + rm -rf $(PROGRAM) *.o + +wipe: clean + rm -rf *.out *.err diff --git a/03-cut-offs/mergesort-co/mergesort-solved.cpp b/03-cut-offs/mergesort-co/mergesort-solved.cpp new file mode 100644 index 0000000..842474b --- /dev/null +++ b/03-cut-offs/mergesort-co/mergesort-solved.cpp @@ -0,0 +1,272 @@ +/* +* This file is part of Christian's OpenMP software lab +* +* Copyright (C) 2016 by Christian Terboven +* Copyright (C) 2016 by Jonas Hahnfeld +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +* +*/ + +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include + +#include + + +/** + * helper routine: check if array is sorted correctly + */ +bool isSorted(int ref[], int data[], const size_t size){ + std::sort(ref, ref + size); + for (size_t idx = 0; idx < size; ++idx){ + if (ref[idx] != data[idx]) { + return false; + } + } + return true; +} + + +/** + * sequential merge step (straight-forward implementation) + */ +void MsMergeSequential(int *out, int *in, long begin1, long end1, long begin2, long end2, long outBegin) { + long left = begin1; + long right = begin2; + + long idx = outBegin; + + while (left < end1 && right < end2) { + if (in[left] <= in[right]) { + out[idx] = in[left]; + left++; + } else { + out[idx] = in[right]; + right++; + } + idx++; + } + + while (left < end1) { + out[idx] = in[left]; + left++, idx++; + } + + while (right < end2) { + out[idx] = in[right]; + right++, idx++; + } +} + + +/** + * sequential MergeSort + */ +void MsSequential(int *array, int *tmp, bool inplace, long begin, long end) { + if (begin < (end - 1)) { + const long half = (begin + end) / 2; + MsSequential(array, tmp, !inplace, begin, half); + MsSequential(array, tmp, !inplace, half, end); + if (inplace) { + MsMergeSequential(array, tmp, begin, half, half, end, begin); + } else { + MsMergeSequential(tmp, array, begin, half, half, end, begin); + } + } else if (!inplace) { + tmp[begin] = array[begin]; + } +} + +/** + * parallel merge step (straight-forward implementation) + */ +void MsMergeParallel(int *out, int *in, long begin1, long end1, long begin2, long end2, long outBegin, int deep) { + if (deep) { + long half1, half2, tmp, count, step; + if ((end1 - begin1) < (end2 - begin2)) { + half2 = (begin2 + end2) / 2; + // find in[half2] in [begin1, end1) (std::upper_bound) + half1 = begin1, count = (end1 - begin1); + while (count > 0) { + step = count / 2; + tmp = half1 + step; + if (in[tmp] <= in[half2]) { + tmp++; + half1 = tmp; + count -= step + 1; + } else { + count = step; + } + } + } else { + half1 = (begin1 + end1) / 2; + // find in[half1] in [begin2, end2) (std::lower_bound) + half2 = begin2, count = (end2 - begin2); + while (count > 0) { + step = count / 2; + tmp = half2 + step; + if (in[tmp] < in[half1]) { + tmp++; + half2 = tmp; + count -= step + 1; + } else { + count = step; + } + } + } + + #pragma omp task default(shared) + { + MsMergeParallel(out, in, begin1, half1, begin2, half2, outBegin, deep - 1); + } + + long outBegin2 = outBegin + (half1 - begin1) + (half2 - begin2); + #pragma omp task default(shared) + { + MsMergeParallel(out, in, half1, end1, half2, end2, outBegin2, deep - 1); + } + + #pragma omp taskwait + } else { + MsMergeSequential(out, in, begin1, end1, begin2, end2, outBegin); + } +} + +/** + * OpenMP Task-parallel MergeSort + */ +void MsParallel(int *array, int *tmp, bool inplace, long begin, long end, int deep) { + if (begin < (end - 1)) { + long half = (begin + end) / 2; + if (deep){ + #pragma omp task default(shared) + { + MsParallel(array, tmp, !inplace, begin, half, deep - 1); + } + + #pragma omp task default(shared) + { + MsParallel(array, tmp, !inplace, half, end, deep - 1); + } + + #pragma omp taskwait + } + else { + MsSequential(array, tmp, !inplace, begin, half); + MsSequential(array, tmp, !inplace, half, end); + } + + if (inplace) { + MsMergeParallel(array, tmp, begin, half, half, end, begin, deep); + } else { + MsMergeParallel(tmp, array, begin, half, half, end, begin, deep); + } + } else if (!inplace) { + tmp[begin] = array[begin]; + } +} + + +/** + * OpenMP Task-parallel MergeSort + * startup routine containing the Parallel Region + */ +void MsParallelOmp(int *array, int *tmp, const size_t size) { + + // compute cut-off recursion level + const int iMinTask = (omp_get_max_threads() * 5); + int deep = 0; + while ((1 << deep) < iMinTask) deep += 1; + +#pragma omp parallel +#pragma omp master + { + MsParallel(array, tmp, true, 0, size, deep); + } +} + +/** + * @brief program entry point + */ +int main(int argc, char* argv[]) { + // variables to measure the elapsed time + struct timeval t1, t2; + double etime; + + // expect one command line arguments: array size + if (argc != 2) { + printf("Usage: MergeSort.exe \n"); + printf("\n"); + return EXIT_FAILURE; + } + else { + const size_t stSize = strtol(argv[1], NULL, 10); + int *data = (int*) malloc(stSize * sizeof(int)); + int *tmp = (int*) malloc(stSize * sizeof(int)); + int *ref = (int*) malloc(stSize * sizeof(int)); + + // first touch + #pragma omp parallel for + for (size_t idx = 0; idx < stSize; ++idx){ + data[idx] = 0; + tmp[idx] = 0; + } + + printf("Initialization...\n"); + + srand(95); + for (size_t idx = 0; idx < stSize; ++idx){ + data[idx] = (int) (stSize * (double(rand()) / RAND_MAX)); + } + std::copy(data, data + stSize, ref); + + double dSize = (stSize * sizeof(int)) / 1024 / 1024; + printf("Sorting %zu elements of type int (%f MiB)...\n", stSize, dSize); + + gettimeofday(&t1, NULL); + MsParallelOmp(data, tmp, stSize); + gettimeofday(&t2, NULL); + + etime = (t2.tv_sec - t1.tv_sec) * 1000 + (t2.tv_usec - t1.tv_usec) / 1000; + etime = etime / 1000; + + printf("done, took %f sec. Verification...", etime); + if (isSorted(ref, data, stSize)) { + printf(" successful.\n"); + } + else { + printf(" FAILED.\n"); + } + + free(data); + free(tmp); + free(ref); + } + + return EXIT_SUCCESS; +} diff --git a/03-cut-offs/mergesort-co/mergesort.cpp b/03-cut-offs/mergesort-co/mergesort.cpp new file mode 100644 index 0000000..1417298 --- /dev/null +++ b/03-cut-offs/mergesort-co/mergesort.cpp @@ -0,0 +1,174 @@ +/* +* This file is part of Christian's OpenMP software lab +* +* Copyright (C) 2016 by Christian Terboven +* Copyright (C) 2016 by Jonas Hahnfeld +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +* +*/ + +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include + + + +/** + * helper routine: check if array is sorted correctly + */ +bool isSorted(int ref[], int data[], const size_t size){ + std::sort(ref, ref + size); + for (size_t idx = 0; idx < size; ++idx){ + if (ref[idx] != data[idx]) { + return false; + } + } + return true; +} + + +/** + * sequential merge step (straight-forward implementation) + */ +// TODO: cut-off could also apply here (extra parameter?) +// TODO: optional: we can also break merge in two halves +void MsMergeSequential(int *out, int *in, long begin1, long end1, long begin2, long end2, long outBegin) { + long left = begin1; + long right = begin2; + + long idx = outBegin; + + while (left < end1 && right < end2) { + if (in[left] <= in[right]) { + out[idx] = in[left]; + left++; + } else { + out[idx] = in[right]; + right++; + } + idx++; + } + + while (left < end1) { + out[idx] = in[left]; + left++, idx++; + } + + while (right < end2) { + out[idx] = in[right]; + right++, idx++; + } +} + + +/** + * sequential MergeSort + */ +// TODO: remember one additional parameter (depth) +// TODO: recursive calls could be taskyfied +// TODO: task synchronization also is required +void MsSequential(int *array, int *tmp, bool inplace, long begin, long end) { + if (begin < (end - 1)) { + const long half = (begin + end) / 2; + MsSequential(array, tmp, !inplace, begin, half); + MsSequential(array, tmp, !inplace, half, end); + if (inplace) { + MsMergeSequential(array, tmp, begin, half, half, end, begin); + } else { + MsMergeSequential(tmp, array, begin, half, half, end, begin); + } + } else if (!inplace) { + tmp[begin] = array[begin]; + } +} + + +/** + * Serial MergeSort + */ +// TODO: this function should create the parallel region +// TODO: good point to compute a good depth level (cut-off) +void MsSerial(int *array, int *tmp, const size_t size) { + + // TODO: parallel version of MsSequential will receive one more parameter: 'depth' (used as cut-off) + MsSequential(array, tmp, true, 0, size); +} + + +/** + * @brief program entry point + */ +int main(int argc, char* argv[]) { + // variables to measure the elapsed time + struct timeval t1, t2; + double etime; + + // expect one command line arguments: array size + if (argc != 2) { + printf("Usage: MergeSort.exe \n"); + printf("\n"); + return EXIT_FAILURE; + } + else { + const size_t stSize = strtol(argv[1], NULL, 10); + int *data = (int*) malloc(stSize * sizeof(int)); + int *tmp = (int*) malloc(stSize * sizeof(int)); + int *ref = (int*) malloc(stSize * sizeof(int)); + + printf("Initialization...\n"); + + srand(95); + for (size_t idx = 0; idx < stSize; ++idx){ + data[idx] = (int) (stSize * (double(rand()) / RAND_MAX)); + } + std::copy(data, data + stSize, ref); + + double dSize = (stSize * sizeof(int)) / 1024 / 1024; + printf("Sorting %zu elements of type int (%f MiB)...\n", stSize, dSize); + + gettimeofday(&t1, NULL); + MsSerial(data, tmp, stSize); + gettimeofday(&t2, NULL); + + etime = (t2.tv_sec - t1.tv_sec) * 1000 + (t2.tv_usec - t1.tv_usec) / 1000; + etime = etime / 1000; + + printf("done, took %f sec. Verification...", etime); + if (isSorted(ref, data, stSize)) { + printf(" successful.\n"); + } + else { + printf(" FAILED.\n"); + } + + free(data); + free(tmp); + free(ref); + } + + return EXIT_SUCCESS; +} diff --git a/03-cut-offs/sudoku-co/Makefile b/03-cut-offs/sudoku-co/Makefile new file mode 100644 index 0000000..d3b68dd --- /dev/null +++ b/03-cut-offs/sudoku-co/Makefile @@ -0,0 +1,29 @@ +BIN_DIR=../bin +PROGRAM=$(BIN_DIR)/sudoku-co + +CC=g++ +CFLAGS=-c -O3 -fopenmp -DUSE_SEQUENTIAL_CUTOFF +LDFLAGS=-fopenmp + +OBJS=SudokuBoard.o sudoku.o + +all: $(PROGRAM) + +$(PROGRAM): $(OBJS) + $(CC) -o $@ $^ $(LDFLAGS) + +%.o: %.cpp + $(CC) $(CFLAGS) -o $@ $^ + +sudoku.cpp: + @cat README.md + @false + +$(BIN_DIR): + mkdir $@ + +clean: + rm -rf $(PROGRAM) *.o + +wipe: clean + rm -rf *.out *.err diff --git a/03-cut-offs/sudoku-co/README.md b/03-cut-offs/sudoku-co/README.md new file mode 100644 index 0000000..422f942 --- /dev/null +++ b/03-cut-offs/sudoku-co/README.md @@ -0,0 +1,5 @@ +==================================================================================================== +Copy your previous version of 01-tasks/sudoku-tsk exercise (sudoku.c) and modify the code in order +to use a native OpenMP cut-off mechanism (ie, the 'if' or the 'final' clauses) instead of the +manual cut-off version implemented as: "USE_SEQUENTIAL_CUTOFF". +==================================================================================================== diff --git a/03-cut-offs/sudoku-co/SudokuBoard.cpp b/03-cut-offs/sudoku-co/SudokuBoard.cpp new file mode 100644 index 0000000..139f4fd --- /dev/null +++ b/03-cut-offs/sudoku-co/SudokuBoard.cpp @@ -0,0 +1,119 @@ +/* + * This file is part of Christian's OpenMP Task-parallel Sudoku Solver + * + * Copyright (C) 2013 by Christian Terboven + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "SudokuBoard.h" + +#include + +CSudokuBoard::CSudokuBoard(int fsize, int bsize) + : field_size(fsize), block_size(bsize), solutions(-1) +{ + field = new int[field_size*field_size]; +} + + +CSudokuBoard::CSudokuBoard(const CSudokuBoard& other) + : field_size(other.getFieldSize()), block_size(other.getBlockSize()), solutions(other.getNumSolutions()) +{ + field = new int[field_size*field_size]; + std::memcpy(field, other.field, sizeof(int) * field_size*field_size); +} + + +CSudokuBoard::~CSudokuBoard(void) +{ + delete[] field; +} + + +bool CSudokuBoard::loadFromFile(char *filename) +{ + std::ifstream ifile(filename); + + if (!ifile) { + std::cout << "There was an error opening the input file " << filename << std::endl; + std::cout << std::endl; + return false; + } + + for (int i = 0; i < this->field_size; i++) { + for (int j = 0; j < this->field_size; j++) { + ifile >> this->field[ACCESS(i,j)]; + } + } + + return true; +} + + +void CSudokuBoard::printBoard() +{ + for(int i = 0; i < field_size; i++) { + for(int j = 0; j < field_size; j++) { + std::cout << std::setw(3) << + this->field[ACCESS(i,j)] + << " "; + } + std::cout << std::endl; + } +} + + +bool CSudokuBoard::check(int x, int y, int value) +{ + if(checkHorizontal(y, value)) + return true; + if(checkVertical(x, value)) + return true; + if(checkBox(x, y, value)) + return true; + return false; +} + + +bool CSudokuBoard::checkHorizontal(int y, int value) { + for(int i = 0; i < field_size; i++) + if(field[ACCESS(y,i)] == value) + return true; + return false; +} + + +bool CSudokuBoard::checkVertical(int x, int value) { + for(int i = 0; i < field_size; i++) + if(field[ACCESS(i,x)] == value) + return true; + return false; +} + + +bool CSudokuBoard::checkBox(int x, int y, int value) { + // find suitable box edge + int x_box = (int)(x / block_size) * block_size; + int y_box = (int)(y / block_size) * block_size; + + for(int i = y_box; i < y_box + block_size; i++) + for(int j = x_box; j < x_box + block_size; j++) + if(field[ACCESS(i,j)] == value) + return true; + + return false; +} diff --git a/03-cut-offs/sudoku-co/SudokuBoard.h b/03-cut-offs/sudoku-co/SudokuBoard.h new file mode 100644 index 0000000..f6c9c74 --- /dev/null +++ b/03-cut-offs/sudoku-co/SudokuBoard.h @@ -0,0 +1,96 @@ +/* + * This file is part of Christian's OpenMP Task-parallel Sudoku Solver + * + * Copyright (C) 2013 by Christian Terboven + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include +#include +#include + +#define ACCESS(x, y) (this->field_size*(x) + (y)) + +/** + * @brief Representation of the Sudoku board, includes solver and utility functions + */ +class CSudokuBoard +{ +public: + CSudokuBoard(int fsize, int bsize); + CSudokuBoard(const CSudokuBoard& other); + ~CSudokuBoard(void); + + inline int getNumSolutions() const { + return this->solutions; + } + + inline void incrementSolutionCounter() { + if (this->solutions == -1) + this->solutions = 1; + else + this->solutions++; + } + + inline int getFieldSize() const { + return this->field_size; + } + + inline int getBlockSize() const { + return this->block_size; + } + + inline int get(int x, int y) const { + return this->field[ACCESS(x,y)]; + } + + inline void set(int x, int y, int value) { + this->field[ACCESS(x, y)] = value; + } + + /** + * Read Sudoku template from file + * @param filename name of file to read input from + * @return true if reading file was successful, false if not + */ + bool loadFromFile(char *filename); + + /** + * Print the Sudoku board to stdout + */ + void printBoard(); + + /** + * Check whether value already used + * @return true if already used, false if not + */ + bool check(int x, int y, int value); + + +private: + bool checkHorizontal(int y, int value); + bool checkVertical(int x, int value); + bool checkBox(int x, int y, int value); + + int field_size; + int block_size; + + int *field; + + int solutions; +}; diff --git a/03-cut-offs/sudoku-co/sudoku-solved.cpp b/03-cut-offs/sudoku-co/sudoku-solved.cpp new file mode 100644 index 0000000..bd1285b --- /dev/null +++ b/03-cut-offs/sudoku-co/sudoku-solved.cpp @@ -0,0 +1,177 @@ +/* + * This file is part of Christian's OpenMP Task-parallel Sudoku Solver + * + * Copyright (C) 2013 by Christian Terboven + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include + +#include +#include + +#include + +#include "SudokuBoard.h" + + +int found_sudokus = 0; + + +/** + * Solve the Sudoku puzzle by finding and counting all solutions (if all are requested) + */ +bool solve(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true, bool findAllSolutions = true) +{ + if(x == sudoku->getFieldSize()) { // end of line + y++; + x = 0; + if(y == sudoku->getFieldSize()) // end + return true; + } + + if(sudoku->get(y, x) > 0) // field already set + return solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions); // tackle next field + + for(int i = 1; i <= sudoku->getFieldSize(); i++) { // try all numbers + if(!sudoku->check(x, y, i)) { + sudoku->set(y, x, i); // if number fits, set it + if(solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions)) { // tackle next field +#pragma omp atomic + found_sudokus++; + sudoku->incrementSolutionCounter(); // solution found :-) + if (printAnyFoundSolution) { + std::cout << "The following is a valid solution:" << std::endl; + sudoku->printBoard(); + std::cout << std::endl; + } + if (!findAllSolutions) { + return true; // return, as only one solution was asked for + } + } + } + } + + sudoku->set(y, x, 0); // no solution found, reset field + return false; +} + + + +/** + * Solve the Sudoku puzzle in parallel by finding and counting all solutions + */ +bool solve_parallel(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true) +{ + if (x == sudoku->getFieldSize()) { // end of line + y++; + x = 0; + if(y == sudoku->getFieldSize()) // end + return true; + } + + if (sudoku->get(y, x) > 0) { // field already set + return solve_parallel(x+1, y, sudoku, printAnyFoundSolution); // tackle next field + } + +//#if USE_SEQUENTIAL_CUTOFF +// if ( y > 1 ) { +// return solve(x, y, sudoku, printAnyFoundSolution); +// } +//#endif + + for (int i = 1; i <= sudoku->getFieldSize(); i++) { // try all numbers + if (!sudoku->check(x, y, i)) { +#pragma omp task firstprivate(i,x,y,sudoku) final(y > 0) + { + CSudokuBoard new_sudoku(*sudoku); + new_sudoku.set(y, x, i); // if number fits, set it + + if (solve_parallel(x+1, y, &new_sudoku, printAnyFoundSolution)) { // tackle next field +#pragma omp atomic + found_sudokus++; + // sudoku->incrementSolutionCounter(); // solution found :-) + if (printAnyFoundSolution) { + std::cout << "The following is a valid solution:" << std::endl; + new_sudoku.printBoard(); + std::cout << std::endl; + } + } + } + } + } + +#pragma omp taskwait + sudoku->set(y, x, 0); // no solution found, reset field + return false; +} + + + +/** @brief program entry point + */ +int main(int argc, char* argv[]) { + // measure the time + double t3, t4; + int nthreads = 0; + + // expect three command line arguments: field size, block size, and input file + if (argc != 4) { + std::cout << "Usage: sudoku.exe " << std::endl; + std::cout << std::endl; + return -1; + } + else { + CSudokuBoard *sudoku1 = new CSudokuBoard(atoi(argv[1]), atoi(argv[2])); + if (!sudoku1->loadFromFile(argv[3])) { + std::cout << "There was an error reading a Sudoku template from " << argv[3] << std::endl; + std::cout << std::endl; + return -1; + } + + // print the Sudoku board template + std::cout << "Given Sudoku template" << std::endl; + sudoku1->printBoard(); + + // solve the Sudoku by finding (and printing) all solutions + t3 = omp_get_wtime(); + #pragma omp parallel sections + { + solve_parallel(0, 0, sudoku1, false); + } + t4 = omp_get_wtime(); + +#pragma omp parallel + { +#pragma omp master + nthreads = omp_get_num_threads(); + } + + std::cout << std::endl; + std::cout << "In total there were " << sudoku1->getNumSolutions() << " solutions found in serial." << std::endl; + std::cout << "In total there were " << found_sudokus << " solutions found in parallel " << std::endl; + std::cout << std::endl; + + delete sudoku1; + } + + // print the time + std::cout << "Parallel computation took " << t4 - t3 << " seconds (" + << nthreads << " threads)." << std::endl << std::endl; + + return 0; +} diff --git a/04-matmul/Makefile b/04-matmul/Makefile new file mode 100644 index 0000000..18f92b9 --- /dev/null +++ b/04-matmul/Makefile @@ -0,0 +1,23 @@ +BIN_DIR=. +PROGRAM=$(BIN_DIR)/matmul + +MKL_HOME=$(MKLROOT) + +CC=gcc +CFLAGS=-O3 -std=gnu99 -fopenmp +LDFLAGS=-fopenmp +#VERBOSE=-DVERBOSE + +all: $(PROGRAM) + +$(PROGRAM): matmul.c + $(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS) + +$(BIN_DIR): + mkdir $@ + +clean: + rm -rf $(PROGRAM) *.o + +wipe: clean + rm -rf *.out *.err diff --git a/04-matmul/matmul.c b/04-matmul/matmul.c new file mode 100644 index 0000000..49ba381 --- /dev/null +++ b/04-matmul/matmul.c @@ -0,0 +1,174 @@ +//===-- matmul.c - Different implementations of matrix multiplies -*- C -*-===// +// +// Part of the LOMP Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include + +#define N 3072 + +#define DUMP_MATRIX 0 +#define DUMP_TASKS 0 + +void matmul_seq(float * C, float * A, float * B, size_t n) { + for (size_t i = 0; i < n; ++i) { + for (size_t k = 0; k < n; ++k) { + for (size_t j = 0; j < n; ++j) { + C[i * n + j] += A[i * n + k] * B[k * n + j]; + } + } + } +} + +void matmul_par(float * C, float * A, float * B, size_t n) { +#pragma omp parallel for schedule(static,8) firstprivate(n) + for (size_t i = 0; i < n; ++i) { + for (size_t k = 0; k < n; ++k) { + for (size_t j = 0; j < n; ++j) { + C[i * n + j] += A[i * n + k] * B[k * n + j]; + } + } + } +} + +void matmul_tloop(float * C, float * A, float * B, size_t n) { +#pragma omp parallel firstprivate(n) +#pragma omp single nowait +#pragma omp taskloop + for (size_t i = 0; i < n; ++i) { + for (size_t k = 0; k < n; ++k) { + for (size_t j = 0; j < n; ++j) { + C[i * n + j] += A[i * n + k] * B[k * n + j]; + } + } + } +} + +void matmul_task(float * C, float * A, float * B, size_t n) { + const size_t bf = 256; + if (n % bf != 0) { + printf("Blocking factor does not divide matrix size!\n"); + exit(EXIT_FAILURE); + } +#pragma omp parallel firstprivate(n, bf) +#pragma omp master + { + // work on the blocks of the matrix + for (size_t ib = 0; ib < n; ib += bf) + for (size_t kb = 0; kb < n; kb += bf) + for (size_t jb = 0; jb < n; jb += bf) { +#pragma omp task firstprivate(ib, kb, jb) firstprivate(n, bf) \ + depend(inout:C[ib * n + jb:bf]) \ + depend(in:A[ib * n + kb:bf]) \ + depend(in:B[kb * n + jb:bf]) + { +#if DUMP_TASKS + printf("task: C[%ld,%ld] += A[%ld,%ld] o B[%ld,%ld]\n", ib, jb, ib, kb, kb, jb); +#endif + // work on a single block`` + for (size_t i = ib; i < (ib + bf); ++i) + for (size_t k = kb; k < (kb + bf); ++k) + for (size_t j = jb; j < (jb + bf); ++j) + C[i * n + j] += A[i * n + k] * B[k * n + j]; + } + } + } +} + +void init_mat(float * C, float * A, float * B, size_t n) { + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < n; ++j) { + C[i * n + j] = 0.0; + A[i * n + j] = 0.5; + B[i * n + j] = 0.25; + } + } +} + +void dump_mat(float * mtx, size_t n) { + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < n; ++j) { + printf("%f ", mtx[i * n + j]); + } + printf("\n"); + } +} + +float sum_mat(float * mtx, size_t n) { + float sum = 0.0; + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < n; ++j) { + sum += mtx[i * n + j]; + } + } + return sum; +} + +int main(int argc, char * argv[]) { + double ts, te; + double t_seq; + + float * C; + float * A; + float * B; + + // If number of arguments is not 1, print help + if (argc != 2) { + printf("%s: matrix_size\n", argv[0]); + return EXIT_FAILURE; + } + const int n = atoi(argv[1]); // matrix size + + C = (float *) malloc(sizeof(*C) * n * n); + A = (float *) malloc(sizeof(*A) * n * n); + B = (float *) malloc(sizeof(*B) * n * n); + + init_mat(C, A, B, n); + ts = omp_get_wtime(); + matmul_seq(C, A, B, n); + te = omp_get_wtime(); +#if DUMP_MATRIX + dump_mat(C, n); +#endif + t_seq = te - ts; + printf("Sum of matrix (serial): %f, wall time %lf, speed-up %.2lf\n", + sum_mat(C, n), (te-ts), t_seq / (te-ts)); + + init_mat(C, A, B, n); + ts = omp_get_wtime(); + matmul_par(C, A, B, n); + te = omp_get_wtime(); +#if DUMP_MATRIX + dump_mat(C, n); +#endif + printf("Sum of matrix (parallel): %f, wall time %lf, speed-up %.2lf\n", + sum_mat(C, n), (te-ts), t_seq / (te-ts)); + + init_mat(C, A, B, n); + ts = omp_get_wtime(); + matmul_tloop(C, A, B, n); + te = omp_get_wtime(); +#if DUMP_MATRIX + dump_mat(C, n); +#endif + printf("Sum of matrix (taskloop): %f, wall time %lf, speed-up %.2lf\n", + sum_mat(C, n), (te-ts), t_seq / (te-ts)); + + init_mat(C, A, B, n); + ts = omp_get_wtime(); + matmul_task(C, A, B, n); + te = omp_get_wtime(); +#if DUMP_MATRIX + dump_mat(C, n); +#endif + printf("Sum of matrix (tasks): %f, wall time %lf, speed-up %.2lf\n", + sum_mat(C, n), (te-ts), t_seq / (te-ts)); + + return EXIT_SUCCESS; +} diff --git a/04-matmul/solution/matmul.c b/04-matmul/solution/matmul.c new file mode 100644 index 0000000..e2a696b --- /dev/null +++ b/04-matmul/solution/matmul.c @@ -0,0 +1,165 @@ +//===-- matmul.c - Different implementations of matrix multiplies -*- C -*-===// +// +// Part of the LOMP Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include + +#define N 3072 + +#define DUMP_MATRIX 0 +#define DUMP_TASKS 0 + +void matmul_seq(float * C, float * A, float * B, size_t n) { + for (size_t i = 0; i < n; ++i) { + for (size_t k = 0; k < n; ++k) { + for (size_t j = 0; j < n; ++j) { + C[i * n + j] += A[i * n + k] * B[k * n + j]; + } + } + } +} + +void matmul_par(float * C, float * A, float * B, size_t n) { +#pragma omp parallel for schedule(static,8) firstprivate(n) + for (size_t i = 0; i < n; ++i) { + for (size_t k = 0; k < n; ++k) { + for (size_t j = 0; j < n; ++j) { + C[i * n + j] += A[i * n + k] * B[k * n + j]; + } + } + } +} + +void matmul_tloop(float * C, float * A, float * B, size_t n) { + for (size_t i = 0; i < n; ++i) { + for (size_t k = 0; k < n; ++k) { + for (size_t j = 0; j < n; ++j) { + C[i * n + j] += A[i * n + k] * B[k * n + j]; + } + } + } +} + +void matmul_task(float * C, float * A, float * B, size_t n) { + const size_t bf = 256; + if (n % bf != 0) { + printf("Blocking factor does not divide matrix size!\n"); + exit(EXIT_FAILURE); + } + { + // work on the blocks of the matrix + for (size_t ib = 0; ib < n; ib += bf) + for (size_t kb = 0; kb < n; kb += bf) + for (size_t jb = 0; jb < n; jb += bf) { + { +#if DUMP_TASKS + printf("task: C[%ld,%ld] += A[%ld,%ld] o B[%ld,%ld]\n", ib, jb, ib, kb, kb, jb); +#endif + // work on a single block`` + for (size_t i = ib; i < (ib + bf); ++i) + for (size_t k = kb; k < (kb + bf); ++k) + for (size_t j = jb; j < (jb + bf); ++j) + C[i * n + j] += A[i * n + k] * B[k * n + j]; + } + } + } +} + +void init_mat(float * C, float * A, float * B, size_t n) { + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < n; ++j) { + C[i * n + j] = 0.0; + A[i * n + j] = 0.5; + B[i * n + j] = 0.25; + } + } +} + +void dump_mat(float * mtx, size_t n) { + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < n; ++j) { + printf("%f ", mtx[i * n + j]); + } + printf("\n"); + } +} + +float sum_mat(float * mtx, size_t n) { + float sum = 0.0; + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < n; ++j) { + sum += mtx[i * n + j]; + } + } + return sum; +} + +int main(int argc, char * argv[]) { + double ts, te; + double t_seq; + + float * C; + float * A; + float * B; + + // If number of arguments is not 1, print help + if (argc != 2) { + printf("%s: matrix_size\n", argv[0]); + return EXIT_FAILURE; + } + const int n = atoi(argv[1]); // matrix size + + C = (float *) malloc(sizeof(*C) * n * n); + A = (float *) malloc(sizeof(*A) * n * n); + B = (float *) malloc(sizeof(*B) * n * n); + + init_mat(C, A, B, n); + ts = omp_get_wtime(); + matmul_seq(C, A, B, n); + te = omp_get_wtime(); +#if DUMP_MATRIX + dump_mat(C, n); +#endif + t_seq = te - ts; + printf("Sum of matrix (serial): %f, wall time %lf, speed-up %.2lf\n", + sum_mat(C, n), (te-ts), t_seq / (te-ts)); + + init_mat(C, A, B, n); + ts = omp_get_wtime(); + matmul_par(C, A, B, n); + te = omp_get_wtime(); +#if DUMP_MATRIX + dump_mat(C, n); +#endif + printf("Sum of matrix (parallel): %f, wall time %lf, speed-up %.2lf\n", + sum_mat(C, n), (te-ts), t_seq / (te-ts)); + + init_mat(C, A, B, n); + ts = omp_get_wtime(); + matmul_tloop(C, A, B, n); + te = omp_get_wtime(); +#if DUMP_MATRIX + dump_mat(C, n); +#endif + printf("Sum of matrix (taskloop): %f, wall time %lf, speed-up %.2lf\n", + sum_mat(C, n), (te-ts), t_seq / (te-ts)); + + init_mat(C, A, B, n); + ts = omp_get_wtime(); + matmul_task(C, A, B, n); + te = omp_get_wtime(); +#if DUMP_MATRIX + dump_mat(C, n); +#endif + printf("Sum of matrix (tasks): %f, wall time %lf, speed-up %.2lf\n", + sum_mat(C, n), (te-ts), t_seq / (te-ts)); + + return EXIT_SUCCESS; +} diff --git a/07-fibonacci/Makefile b/07-fibonacci/Makefile new file mode 100644 index 0000000..ffc351b --- /dev/null +++ b/07-fibonacci/Makefile @@ -0,0 +1,20 @@ +PROG=fibonacci + +CC=gcc +CCFLAGS=-O3 -fopenmp +LDFLAGS=-fopenmp + +debug: + $(CC) $(CCFLAGS) -g -O0 ${PROG}.c -o ${PROG}.exe + +release: + $(CC) $(CCFLAGS) -O3 ${PROG}.c -o ${PROG}.exe + +solution: + $(CC) $(CCFLAGS) -O3 ${PROG}-solved.c -o ${PROG}-solved.exe + +run: + ./${PROG}.exe < input + +clean: + rm -f ${PROG}.exe ${PROG}-solved.exe ${PROG}.o *~ diff --git a/07-fibonacci/fibonacci-solved.c b/07-fibonacci/fibonacci-solved.c new file mode 100644 index 0000000..1ae868e --- /dev/null +++ b/07-fibonacci/fibonacci-solved.c @@ -0,0 +1,79 @@ +/* + * OpenMP lecture exercises + * Copyright (C) 2011 by Christian Terboven + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include + +int ser_fib(int n) +{ + int x, y; + if (n < 2) + return n; + + x = ser_fib(n - 1); + + y = ser_fib(n - 2); + + return x+y; +} + +int fib(int n) +{ + int x, y; + if (n < 2) + return n; + else if (n < 30) + return ser_fib(n); + + #pragma omp task shared(x) + { + x = fib(n - 1); + } + + #pragma omp task shared(y) + { + y = fib(n - 2); + } + + #pragma omp taskwait + + return x+y; + +} + + +int main() +{ + int n,fibonacci; + double starttime; + printf("Please insert n, to calculate fib(n): \n"); + scanf("%d",&n); + starttime=omp_get_wtime(); + + #pragma omp parallel + #pragma omp single + { + fibonacci=fib(n); + } + + printf("fib(%d)=%d \n",n,fibonacci); + printf("calculation took %lf sec\n",omp_get_wtime()-starttime); + return 0; +} diff --git a/07-fibonacci/fibonacci.c b/07-fibonacci/fibonacci.c new file mode 100644 index 0000000..bd9d333 --- /dev/null +++ b/07-fibonacci/fibonacci.c @@ -0,0 +1,53 @@ +/* + * OpenMP lecture exercises + * Copyright (C) 2011 by Christian Terboven + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include + + +int fib(int n) +{ + int x, y; + if (n < 2) + return n; + + x = fib(n - 1); + + y = fib(n - 2); + + return x+y; + +} + + +int main() +{ + int n,fibonacci; + double starttime; + printf("Please insert n, to calculate fib(n): \n"); + scanf("%d",&n); + starttime=omp_get_wtime(); + + fibonacci=fib(n); + + printf("fib(%d)=%d \n",n,fibonacci); + printf("calculation took %lf sec\n",omp_get_wtime()-starttime); + return 0; +} diff --git a/07-fibonacci/input b/07-fibonacci/input new file mode 100644 index 0000000..8f92bfd --- /dev/null +++ b/07-fibonacci/input @@ -0,0 +1 @@ +35 diff --git a/08-quicksort/Makefile b/08-quicksort/Makefile new file mode 100644 index 0000000..eec4117 --- /dev/null +++ b/08-quicksort/Makefile @@ -0,0 +1,20 @@ +PROG=quicksort + +CXX = g++ +CXXFLAGS=-O3 -fopenmp +LDFLAGS=-fopenmp + +debug: + $(CXX) $(CXXFLAGS) -g -O0 ${PROG}.cpp -o ${PROG}.exe + +release: + $(CXX) $(CXXFLAGS) -O3 ${PROG}.cpp -o ${PROG}.exe + +solution: + $(CXX) $(CXXFLAGS) -O3 ${PROG}-solved.cpp -o ${PROG}-solved.exe + +run: + ./${PROG}.exe + +clean: + rm -f ${PROG}.exe ${PROG}-solved.exe ${PROG}.o *~ diff --git a/08-quicksort/quicksort-solved.cpp b/08-quicksort/quicksort-solved.cpp new file mode 100644 index 0000000..461140b --- /dev/null +++ b/08-quicksort/quicksort-solved.cpp @@ -0,0 +1,98 @@ +#include +#include +#include +#include +#include +using namespace std; + +void init(int* array,int length){ + srand((unsigned)time(0)); + for(int i=0;iarray[i+1]){ + cout << "array[" << i << "] > array[" << i+1 << "]" << endl; + return false; + } + } + return true; +} + +int main(int argc, char *argv[]){ + int length; + int *toBeSorted; + double t1,t2; + if(argc < 2){ + length=6000000; + }else{ + length=atoi(argv[1]); + } + toBeSorted = new int[length]; + + init(toBeSorted, length); + cout << "Sorting an array of " << length << " elements." << endl; + + t1=omp_get_wtime(); + #pragma omp parallel + #pragma omp single + { + quicksort(toBeSorted,0,length-1); + } + t2=omp_get_wtime()-t1; + + cout << "quicksort took " << t2 << " sec. to complete" << endl; + if (!checkFn(toBeSorted, length)) { + cout << "validation failed!" << endl; + } + delete [] toBeSorted; +} diff --git a/08-quicksort/quicksort.cpp b/08-quicksort/quicksort.cpp new file mode 100644 index 0000000..8f2652c --- /dev/null +++ b/08-quicksort/quicksort.cpp @@ -0,0 +1,76 @@ +#include +#include +#include +#include +#include +using namespace std; + +void init(int* array,int length){ + srand((unsigned)time(0)); + for(int i=0;iarray[i+1]){ + cout << "array[" << i << "] > array[" << i+1 << "]" << endl; + return false; + } + } + return true; +} + +int main(int argc, char *argv[]){ + int length; + int *toBeSorted; + double t1,t2; + if(argc < 2){ + length=6000000; + }else{ + length=atoi(argv[1]); + } + toBeSorted = new int[length]; + + init(toBeSorted, length); + cout << "Sorting an array of " << length << " elements." << endl; + + t1=omp_get_wtime(); + quicksort(toBeSorted,0,length-1); + t2=omp_get_wtime()-t1; + + cout << "quicksort took " << t2 << " sec. to complete" << endl; + if (!checkFn(toBeSorted, length)) { + cout << "validation failed!" << endl; + } + delete [] toBeSorted; +} diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..ef5f7f5 --- /dev/null +++ b/Makefile @@ -0,0 +1,31 @@ +ALL_RECURSIVE=all-recursive clean-recursive + +TUT=SC19 + +DIRS=$(shell ls -d *-* | sed '/_/d' 2>/dev/null) + +all: $(CONFIG) all-recursive +clean: $(CONFIG) clean-recursive + +$(ALL_RECURSIVE): + @failcom='exit 1';\ + target=`echo $@ | sed s/-recursive//`; \ + for subdir in $(DIRS); do \ + (cd $$subdir && $(MAKE) $$target) || eval $$failcom; \ + done; + +dist: + @failcom='exit 1';\ + for subdir in $(DIRS); do \ + (cd $$subdir && $(MAKE) clean) || eval $$failcom; \ + done; \ + tar --exclude="*-solved*" -cvzf ../mt-openmp-$(TUT).tar.gz --transform s/mt-openmp/mt-openmp-$(TUT)/ ../mt-openmp/* + +dist-solved: + @failcom='exit 1';\ + for subdir in $(DIRS); do \ + (cd $$subdir && $(MAKE) clean) || eval $$failcom; \ + done; \ + tar -cvzf ../mt-openmp-$(TUT)-solved.tar.gz --transform s/mt-openmp/mt-openmp-$(TUT)-solved/ ../mt-openmp/* + + diff --git a/slides/day-1/00-OpenMP_Introduction.pdf b/slides/day-1/00-OpenMP_Introduction.pdf new file mode 100644 index 0000000..ca75e29 Binary files /dev/null and b/slides/day-1/00-OpenMP_Introduction.pdf differ diff --git a/slides/day-1/01-OpenMP_Hands-on.pdf b/slides/day-1/01-OpenMP_Hands-on.pdf new file mode 100644 index 0000000..e47232c Binary files /dev/null and b/slides/day-1/01-OpenMP_Hands-on.pdf differ diff --git a/slides/day-1/03-OpenMP-NUMA.pdf b/slides/day-1/03-OpenMP-NUMA.pdf new file mode 100644 index 0000000..6dcd9fe Binary files /dev/null and b/slides/day-1/03-OpenMP-NUMA.pdf differ diff --git a/slides/day-2/04-OpenMP-Task-Intro.pdf b/slides/day-2/04-OpenMP-Task-Intro.pdf new file mode 100644 index 0000000..df9d610 Binary files /dev/null and b/slides/day-2/04-OpenMP-Task-Intro.pdf differ diff --git a/slides/day-2/05-OpenMP-Task-Loops-Deps.pdf b/slides/day-2/05-OpenMP-Task-Loops-Deps.pdf new file mode 100644 index 0000000..5164661 Binary files /dev/null and b/slides/day-2/05-OpenMP-Task-Loops-Deps.pdf differ diff --git a/slides/day-2/06-OpenMP-SIMD.pdf b/slides/day-2/06-OpenMP-SIMD.pdf new file mode 100644 index 0000000..e3c3029 Binary files /dev/null and b/slides/day-2/06-OpenMP-SIMD.pdf differ diff --git a/slides/day-3/07-OpenMP-GPUs.pdf b/slides/day-3/07-OpenMP-GPUs.pdf new file mode 100644 index 0000000..d436f22 Binary files /dev/null and b/slides/day-3/07-OpenMP-GPUs.pdf differ diff --git a/slides/day-3/08-Openmp-Misc-Advanced-Outlook.pdf b/slides/day-3/08-Openmp-Misc-Advanced-Outlook.pdf new file mode 100644 index 0000000..e43d42e Binary files /dev/null and b/slides/day-3/08-Openmp-Misc-Advanced-Outlook.pdf differ