Exercises and slides

2024-04-26 15:47:20 +02:00 · 2024-04-26 15:47:20 +02:00 · 8307cf55bf
commit 8307cf55bf
70 changed files with 5333 additions and 0 deletions
--- a/00-worksharing/Makefile
+++ b/00-worksharing/Makefile
@ -0,0 +1,14 @@
+ALL_RECURSIVE=all-recursive clean-recursive 
+
+DIRS=$(shell ls -d *-* | sed '/_/d' 2>/dev/null)
+
+all:   $(CONFIG) all-recursive
+clean: $(CONFIG) clean-recursive
+
+$(ALL_RECURSIVE):
+	@failcom='exit 1';\
+   target=`echo $@ | sed s/-recursive//`; \
+   for subdir in $(DIRS); do \
+      (cd $$subdir && $(MAKE) $$target) || eval $$failcom; \
+   done;
+
--- a/00-worksharing/bin/.gitdir
+++ b/00-worksharing/bin/.gitdir
--- a/00-worksharing/bin/run-cholesky.sh
+++ b/00-worksharing/bin/run-cholesky.sh
@ -0,0 +1,30 @@
+#!/bin/bash
+#SBATCH --job-name=cholesky
+#SBATCH --workdir=.
+#SBATCH --output=cholesky_%j.out
+#SBATCH --error=cholesky_%j.err
+#SBATCH --cpus-per-task=48
+#SBATCH --ntasks=1
+#SBATCH --time=00:30:00
+#SBATCH --qos=debug
+
+export IFS=";"
+
+THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48"
+#THREADS="01;02;04;06;08;10;12"
+
+#MSIZES="2048;4096;8192"
+MSIZES="8192"
+MSIZES="16384"
+MSIZES="2048"
+
+BSIZES="256"
+
+for MS in $MSIZES; do
+  for BS in $BSIZES; do
+    for threads in $THREADS; do
+      OMP_NUM_THREADS=$threads ./cholesky-for $MS $BS 0
+      OMP_NUM_THREADS=$threads ./cholesky-for-opt $MS $BS 0
+    done
+  done
+done
--- a/00-worksharing/cholesky-for-opt/Makefile
+++ b/00-worksharing/cholesky-for-opt/Makefile
@ -0,0 +1,23 @@
+BIN_DIR=../bin
+PROGRAM=$(BIN_DIR)/cholesky-for-opt
+
+MKL_HOME=$(MKLROOT)
+
+CC=gcc
+CFLAGS=-O3 -std=gnu99 -I$(MKL_HOME)/include -fopenmp
+LDFLAGS=-L$(MKL_HOME)/lib/intel64 -lmkl_sequential -lmkl_core -lmkl_rt -lpthread -lm -fopenmp
+#VERBOSE=-DVERBOSE
+
+all: $(PROGRAM) 
+
+$(PROGRAM): cholesky.c 
+	$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
+
+$(BIN_DIR): 
+	mkdir $@
+
+clean:
+	rm -rf $(PROGRAM) *.o
+
+wipe: clean
+	rm -rf *.out *.err
--- a/00-worksharing/cholesky-for-opt/cholesky.c
+++ b/00-worksharing/cholesky-for-opt/cholesky.c
@ -0,0 +1,289 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+
+#include <sys/time.h>
+#include <sys/times.h>
+
+#include <math.h>
+#include <mkl.h>
+
+#include "omp.h"
+
+#if !defined(_OPENMP)
+    int omp_get_max_threads() { return 1; }
+    int omp_get_num_threads() { return 1; }
+#endif
+
+void cholesky(int ts, int nt, double* Ah[nt][nt])
+{
+#ifdef VERBOSE
+	printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
+#endif
+
+   for (int k = 0; k < nt; k++) {
+      // Diagonal Block factorization: using LAPACK
+      LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
+
+      // Triangular systems
+      for (int i = k + 1; i < nt; i++) {
+         cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
+      }
+
+      // Update trailing matrix
+      for (int i = k + 1; i < nt; i++) {
+         for (int j = k + 1; j < i; j++) {
+            cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
+         }
+         cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
+      }
+   }
+
+#ifdef VERBOSE
+	printf("> ...end of Cholesky Factorization.\n");
+#endif
+}
+
+float get_time()
+{
+	static double gtod_ref_time_sec = 0.0;
+
+	struct timeval tv;
+	gettimeofday(&tv, NULL);
+
+	// If this is the first invocation of through dclock(), then initialize the
+	// "reference time" global variable to the seconds field of the tv struct.
+	if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
+
+	// Normalize the seconds field of the tv struct so that it is relative to the
+	// "reference time" that was recorded during the first invocation of dclock().
+	const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
+
+	// Compute the number of seconds since the reference time.
+	const double t = norm_sec + tv.tv_usec * 1.0e-6;
+
+	return (float) t;
+}
+
+// Robust Check the factorization of the matrix A2
+// Using directly Fortran services: dlacpy_, dtrmm, dlange_
+static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
+{
+#ifdef VERBOSE
+	printf("> Checking the Cholesky Factorization... \n");
+#endif
+	char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
+
+	double *Residual = (double *) malloc(N*N*sizeof(double));
+	double *L1       = (double *) malloc(N*N*sizeof(double));
+	double *L2       = (double *) malloc(N*N*sizeof(double));
+	double *work     = (double *) malloc(N*sizeof(double));
+
+	memset((void*)L1, 0, N*N*sizeof(double));
+	memset((void*)L2, 0, N*N*sizeof(double));
+
+	double alpha= 1.0;
+
+	dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
+
+	/* Dealing with L'L or U'U  */
+	if (uplo == 'U'){
+		dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
+		dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
+		dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
+	} else{
+		dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
+		dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
+		dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
+	}
+
+	/* Compute the Residual || A -L'L|| */
+	for (int i = 0; i < N; i++)
+		for (int j = 0; j < N; j++)
+			Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
+
+	double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
+	double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
+
+#ifdef VERBOSE
+	printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
+#endif
+
+	const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
+
+#ifdef VERBOSE
+	if ( info_factorization) printf("> - Factorization is suspicious!\n");
+	else printf("> - Factorization is CORRECT!\n");
+#endif
+
+	free(Residual); free(L1); free(L2); free(work);
+
+	return info_factorization;
+}
+
+void initialize_matrix(const int n, const int ts, double *matrix)
+{
+#ifdef VERBOSE
+	printf("> Initializing matrix with random values...\n");
+#endif
+
+	int ISEED[4] = {0,0,0,1};
+	int intONE=1;
+
+	for (int i = 0; i < n*n; i+=n) {
+		dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
+	}
+
+	for (int i=0; i<n; i++) {
+		for (int j=0; j<n; j++) {
+			matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
+			matrix[i*n+j] = matrix[j*n+i];
+		}
+	}
+
+   // Diagonal values
+	for (int i = 0; i < n; i++) {
+		matrix[i*n+i] += (double) n;
+   }
+}
+
+void gather_block(const int N, const int ts, double *Alin, double *A)
+{
+	for (int i = 0; i < ts; i++)
+		for (int j = 0; j < ts; j++) {
+			A[i*ts + j] = Alin[i*N + j];
+		}
+}
+
+void scatter_block(const int N, const int ts, double *A, double *Alin)
+{
+	for (int i = 0; i < ts; i++)
+		for (int j = 0; j < ts; j++) {
+			Alin[i*N + j] = A[i*ts + j];
+		}
+}
+
+void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
+{
+#ifdef VERBOSE
+	printf("> Converting linear matrix to blocks...\n");
+#endif
+   for (int i = 0; i < DIM; i++)
+      for (int j = 0; j < DIM; j++) {
+         gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
+      }
+}
+
+void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
+{
+#ifdef VERBOSE
+	printf("> Converting blocked matrix to linear...\n");
+#endif
+   for (int i = 0; i < DIM; i++)
+      for (int j = 0; j < DIM; j++) {
+         scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
+      }
+}
+
+int main(int argc, char* argv[])
+{
+   char *result[3] = {"n/a","pass","FAIL"};
+	const double eps = pow(2.0, -53);
+
+   // If number of arguments is not 4, print help
+   if ( argc != 4) {
+      printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
+      exit( -1 );
+   }
+
+   const int  n = atoi(argv[1]); // matrix size
+   const int ts = atoi(argv[2]); // tile size
+   int check    = atoi(argv[3]); // check result?
+
+   // Compute number of tiles
+   const int nt = n / ts;
+   assert((nt*ts) == n); // tile size should divide size
+
+   // Allocate matrix
+   double * const matrix = (double *) malloc(n * n * sizeof(double));
+   assert(matrix != NULL);
+
+   // Initialize matrix
+   initialize_matrix(n, ts, matrix);
+
+   // Allocate original matrix, and duplicate it, for debugging purposes
+   double * const original_matrix = (double *) malloc(n * n * sizeof(double));
+   assert(original_matrix != NULL);
+
+   // Save a copy of the original matrix
+   for (int i = 0; i < n * n; i++ ) {
+      original_matrix[i] = matrix[i];
+   }
+
+   // Set version description: Indirect blocked matrix
+   const char *version = "I-Blocked matrix";
+
+   // Allocate blocked matrix
+   double *Ah[nt][nt];
+   for (int i = 0; i < nt; i++) {
+      for (int j = 0; j < nt; j++) {
+         Ah[i][j] = malloc(ts * ts * sizeof(double));
+         assert(Ah[i][j] != NULL);
+      }
+   }
+
+   // ---------------------------------------
+   // Convert, compute (time), and re-convert
+   // ---------------------------------------
+   convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
+   const float tref = get_time();
+   cholesky(ts, nt, (double* (*)[nt]) Ah);
+   const float time = get_time() - tref;
+   convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
+
+   // Free blocked matrix
+   for (int i = 0; i < nt; i++) {
+      for (int j = 0; j < nt; j++) {
+         assert(Ah[i][j] != NULL);
+         free(Ah[i][j]);
+      }
+   }
+
+   // Check result, if requested
+   if ( check ) {
+      const char uplo = 'L';
+      if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
+   }
+
+   // Free original matrix, not needed anymore
+   free(original_matrix);
+
+   // Compute GFLOPs (Not verified)
+   float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
+
+   // Print results
+#ifdef VERBOSE
+   printf( "\n" );
+   printf( "============ CHOLESKY RESULTS ============\n" );
+   printf( "  test                  %s\n", argv[0]);
+   printf( "  version               %s\n", version);
+   printf( "  matrix size:          %dx%d\n", n, n);
+   printf( "  tile size:            %dx%d\n", ts, ts);
+   printf( "  number of threads:    %d\n", omp_get_max_threads());
+   printf( "  time (s):             %f\n", time);
+   printf( "  performance (gflops): %f\n", gflops);
+   printf( "  check:                %s\n", result[check]);
+   printf( "==========================================\n" );
+#else
+   printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
+#endif
+
+   // Free matrix
+   free(matrix);
+
+   return 0;
+}
+
--- a/00-worksharing/cholesky-for-opt/solution/cholesky.c
+++ b/00-worksharing/cholesky-for-opt/solution/cholesky.c
@ -0,0 +1,293 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+
+#include <sys/time.h>
+#include <sys/times.h>
+
+#include <math.h>
+#include <mkl.h>
+
+#include "omp.h"
+
+#if !defined(_OPENMP)
+    int omp_get_max_threads() { return 1; }
+    int omp_get_num_threads() { return 1; }
+#endif
+
+void cholesky(int ts, int nt, double* Ah[nt][nt])
+{
+#ifdef VERBOSE
+	printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
+#endif
+
+   for (int k = 0; k < nt; k++) {
+      // Diagonal Block factorization: using LAPACK
+      LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
+
+      // Triangular systems
+      #pragma omp parallel for
+      for (int i = k + 1; i < nt; i++) {
+         cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
+      }
+
+      // Update trailing matrix
+      #pragma omp parallel
+      for (int i = k + 1; i < nt; i++) {
+         #pragma omp for schedule(dynamic, 1) nowait
+         for (int j = k + 1; j < i; j++) {
+            cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
+         }
+         #pragma omp single nowait
+         cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
+      }
+   }
+
+#ifdef VERBOSE
+	printf("> ...end of Cholesky Factorization.\n");
+#endif
+}
+
+float get_time()
+{
+	static double gtod_ref_time_sec = 0.0;
+
+	struct timeval tv;
+	gettimeofday(&tv, NULL);
+
+	// If this is the first invocation of through dclock(), then initialize the
+	// "reference time" global variable to the seconds field of the tv struct.
+	if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
+
+	// Normalize the seconds field of the tv struct so that it is relative to the
+	// "reference time" that was recorded during the first invocation of dclock().
+	const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
+
+	// Compute the number of seconds since the reference time.
+	const double t = norm_sec + tv.tv_usec * 1.0e-6;
+
+	return (float) t;
+}
+
+// Robust Check the factorization of the matrix A2
+// Using directly Fortran services: dlacpy_, dtrmm, dlange_
+static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
+{
+#ifdef VERBOSE
+	printf("> Checking the Cholesky Factorization... \n");
+#endif
+	char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
+
+	double *Residual = (double *) malloc(N*N*sizeof(double));
+	double *L1       = (double *) malloc(N*N*sizeof(double));
+	double *L2       = (double *) malloc(N*N*sizeof(double));
+	double *work     = (double *) malloc(N*sizeof(double));
+
+	memset((void*)L1, 0, N*N*sizeof(double));
+	memset((void*)L2, 0, N*N*sizeof(double));
+
+	double alpha= 1.0;
+
+	dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
+
+	/* Dealing with L'L or U'U  */
+	if (uplo == 'U'){
+		dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
+		dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
+		dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
+	} else{
+		dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
+		dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
+		dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
+	}
+
+	/* Compute the Residual || A -L'L|| */
+	for (int i = 0; i < N; i++)
+		for (int j = 0; j < N; j++)
+			Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
+
+	double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
+	double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
+
+#ifdef VERBOSE
+	printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
+#endif
+
+	const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
+
+#ifdef VERBOSE
+	if ( info_factorization) printf("> - Factorization is suspicious!\n");
+	else printf("> - Factorization is CORRECT!\n");
+#endif
+
+	free(Residual); free(L1); free(L2); free(work);
+
+	return info_factorization;
+}
+
+void initialize_matrix(const int n, const int ts, double *matrix)
+{
+#ifdef VERBOSE
+	printf("> Initializing matrix with random values...\n");
+#endif
+
+	int ISEED[4] = {0,0,0,1};
+	int intONE=1;
+
+	for (int i = 0; i < n*n; i+=n) {
+		dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
+	}
+
+	for (int i=0; i<n; i++) {
+		for (int j=0; j<n; j++) {
+			matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
+			matrix[i*n+j] = matrix[j*n+i];
+		}
+	}
+
+   // Diagonal values
+	for (int i = 0; i < n; i++) {
+		matrix[i*n+i] += (double) n;
+   }
+}
+
+void gather_block(const int N, const int ts, double *Alin, double *A)
+{
+	for (int i = 0; i < ts; i++)
+		for (int j = 0; j < ts; j++) {
+			A[i*ts + j] = Alin[i*N + j];
+		}
+}
+
+void scatter_block(const int N, const int ts, double *A, double *Alin)
+{
+	for (int i = 0; i < ts; i++)
+		for (int j = 0; j < ts; j++) {
+			Alin[i*N + j] = A[i*ts + j];
+		}
+}
+
+void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
+{
+#ifdef VERBOSE
+	printf("> Converting linear matrix to blocks...\n");
+#endif
+   for (int i = 0; i < DIM; i++)
+      for (int j = 0; j < DIM; j++) {
+         gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
+      }
+}
+
+void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
+{
+#ifdef VERBOSE
+	printf("> Converting blocked matrix to linear...\n");
+#endif
+   for (int i = 0; i < DIM; i++)
+      for (int j = 0; j < DIM; j++) {
+         scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
+      }
+}
+
+int main(int argc, char* argv[])
+{
+   char *result[3] = {"n/a","pass","FAIL"};
+	const double eps = pow(2.0, -53);
+
+   // If number of arguments is not 4, print help
+   if ( argc != 4) {
+      printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
+      exit( -1 );
+   }
+
+   const int  n = atoi(argv[1]); // matrix size
+   const int ts = atoi(argv[2]); // tile size
+   int check    = atoi(argv[3]); // check result?
+
+   // Compute number of tiles
+   const int nt = n / ts;
+   assert((nt*ts) == n); // tile size should divide size
+
+   // Allocate matrix
+   double * const matrix = (double *) malloc(n * n * sizeof(double));
+   assert(matrix != NULL);
+
+   // Initialize matrix
+   initialize_matrix(n, ts, matrix);
+
+   // Allocate original matrix, and duplicate it, for debugging purposes
+   double * const original_matrix = (double *) malloc(n * n * sizeof(double));
+   assert(original_matrix != NULL);
+
+   // Save a copy of the original matrix
+   for (int i = 0; i < n * n; i++ ) {
+      original_matrix[i] = matrix[i];
+   }
+
+   // Set version description: Indirect blocked matrix
+   const char *version = "I-Blocked matrix";
+
+   // Allocate blocked matrix
+   double *Ah[nt][nt];
+   for (int i = 0; i < nt; i++) {
+      for (int j = 0; j < nt; j++) {
+         Ah[i][j] = malloc(ts * ts * sizeof(double));
+         assert(Ah[i][j] != NULL);
+      }
+   }
+
+   // ---------------------------------------
+   // Convert, compute (time), and re-convert
+   // ---------------------------------------
+   convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
+   const float tref = get_time();
+   cholesky(ts, nt, (double* (*)[nt]) Ah);
+   const float time = get_time() - tref;
+   convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
+
+   // Free blocked matrix
+   for (int i = 0; i < nt; i++) {
+      for (int j = 0; j < nt; j++) {
+         assert(Ah[i][j] != NULL);
+         free(Ah[i][j]);
+      }
+   }
+
+   // Check result, if requested
+   if ( check ) {
+      const char uplo = 'L';
+      if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
+   }
+
+   // Free original matrix, not needed anymore
+   free(original_matrix);
+
+   // Compute GFLOPs (Not verified)
+   float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
+
+   // Print results
+#ifdef VERBOSE
+   printf( "\n" );
+   printf( "============ CHOLESKY RESULTS ============\n" );
+   printf( "  test                  %s\n", argv[0]);
+   printf( "  version               %s\n", version);
+   printf( "  matrix size:          %dx%d\n", n, n);
+   printf( "  tile size:            %dx%d\n", ts, ts);
+   printf( "  number of threads:    %d\n", omp_get_max_threads());
+   printf( "  time (s):             %f\n", time);
+   printf( "  performance (gflops): %f\n", gflops);
+   printf( "  check:                %s\n", result[check]);
+   printf( "==========================================\n" );
+#else
+   printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
+#endif
+
+   // Free matrix
+   free(matrix);
+
+   return 0;
+}
+
--- a/00-worksharing/cholesky-for/Makefile
+++ b/00-worksharing/cholesky-for/Makefile
@ -0,0 +1,23 @@
+BIN_DIR=../bin
+PROGRAM=$(BIN_DIR)/cholesky-for
+
+MKL_HOME=$(MKLROOT)
+
+CC=gcc
+CFLAGS=-O3 -std=gnu99 -I$(MKL_HOME)/include -fopenmp
+LDFLAGS=-L$(MKL_HOME)/lib/intel64 -lmkl_sequential -lmkl_core -lmkl_rt -lpthread -lm -fopenmp
+#VERBOSE=-DVERBOSE
+
+all: $(PROGRAM) 
+
+$(PROGRAM): cholesky.c 
+	$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
+
+$(BIN_DIR): 
+	mkdir $@
+
+clean:
+	rm -rf $(PROGRAM) *.o
+
+wipe: clean
+	rm -rf *.out *.err
--- a/00-worksharing/cholesky-for/cholesky.c
+++ b/00-worksharing/cholesky-for/cholesky.c
@ -0,0 +1,289 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+
+#include <sys/time.h>
+#include <sys/times.h>
+
+#include <math.h>
+#include <mkl.h>
+
+#include "omp.h"
+
+#if !defined(_OPENMP)
+    int omp_get_max_threads() { return 1; }
+    int omp_get_num_threads() { return 1; }
+#endif
+
+void cholesky(int ts, int nt, double* Ah[nt][nt])
+{
+#ifdef VERBOSE
+	printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
+#endif
+
+   for (int k = 0; k < nt; k++) {
+      // Diagonal Block factorization: using LAPACK
+      LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
+
+      // Triangular systems
+      for (int i = k + 1; i < nt; i++) {
+         cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
+      }
+
+      // Update trailing matrix
+      for (int i = k + 1; i < nt; i++) {
+         for (int j = k + 1; j < i; j++) {
+            cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
+         }
+         cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
+      }
+   }
+
+#ifdef VERBOSE
+	printf("> ...end of Cholesky Factorization.\n");
+#endif
+}
+
+float get_time()
+{
+	static double gtod_ref_time_sec = 0.0;
+
+	struct timeval tv;
+	gettimeofday(&tv, NULL);
+
+	// If this is the first invocation of through dclock(), then initialize the
+	// "reference time" global variable to the seconds field of the tv struct.
+	if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
+
+	// Normalize the seconds field of the tv struct so that it is relative to the
+	// "reference time" that was recorded during the first invocation of dclock().
+	const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
+
+	// Compute the number of seconds since the reference time.
+	const double t = norm_sec + tv.tv_usec * 1.0e-6;
+
+	return (float) t;
+}
+
+// Robust Check the factorization of the matrix A2
+// Using directly Fortran services: dlacpy_, dtrmm, dlange_
+static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
+{
+#ifdef VERBOSE
+	printf("> Checking the Cholesky Factorization... \n");
+#endif
+	char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
+
+	double *Residual = (double *) malloc(N*N*sizeof(double));
+	double *L1       = (double *) malloc(N*N*sizeof(double));
+	double *L2       = (double *) malloc(N*N*sizeof(double));
+	double *work     = (double *) malloc(N*sizeof(double));
+
+	memset((void*)L1, 0, N*N*sizeof(double));
+	memset((void*)L2, 0, N*N*sizeof(double));
+
+	double alpha= 1.0;
+
+	dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
+
+	/* Dealing with L'L or U'U  */
+	if (uplo == 'U'){
+		dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
+		dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
+		dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
+	} else{
+		dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
+		dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
+		dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
+	}
+
+	/* Compute the Residual || A -L'L|| */
+	for (int i = 0; i < N; i++)
+		for (int j = 0; j < N; j++)
+			Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
+
+	double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
+	double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
+
+#ifdef VERBOSE
+	printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
+#endif
+
+	const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
+
+#ifdef VERBOSE
+	if ( info_factorization) printf("> - Factorization is suspicious!\n");
+	else printf("> - Factorization is CORRECT!\n");
+#endif
+
+	free(Residual); free(L1); free(L2); free(work);
+
+	return info_factorization;
+}
+
+void initialize_matrix(const int n, const int ts, double *matrix)
+{
+#ifdef VERBOSE
+	printf("> Initializing matrix with random values...\n");
+#endif
+
+	int ISEED[4] = {0,0,0,1};
+	int intONE=1;
+
+	for (int i = 0; i < n*n; i+=n) {
+		dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
+	}
+
+	for (int i=0; i<n; i++) {
+		for (int j=0; j<n; j++) {
+			matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
+			matrix[i*n+j] = matrix[j*n+i];
+		}
+	}
+
+   // Diagonal values
+	for (int i = 0; i < n; i++) {
+		matrix[i*n+i] += (double) n;
+   }
+}
+
+void gather_block(const int N, const int ts, double *Alin, double *A)
+{
+	for (int i = 0; i < ts; i++)
+		for (int j = 0; j < ts; j++) {
+			A[i*ts + j] = Alin[i*N + j];
+		}
+}
+
+void scatter_block(const int N, const int ts, double *A, double *Alin)
+{
+	for (int i = 0; i < ts; i++)
+		for (int j = 0; j < ts; j++) {
+			Alin[i*N + j] = A[i*ts + j];
+		}
+}
+
+void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
+{
+#ifdef VERBOSE
+	printf("> Converting linear matrix to blocks...\n");
+#endif
+   for (int i = 0; i < DIM; i++)
+      for (int j = 0; j < DIM; j++) {
+         gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
+      }
+}
+
+void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
+{
+#ifdef VERBOSE
+	printf("> Converting blocked matrix to linear...\n");
+#endif
+   for (int i = 0; i < DIM; i++)
+      for (int j = 0; j < DIM; j++) {
+         scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
+      }
+}
+
+int main(int argc, char* argv[])
+{
+   char *result[3] = {"n/a","pass","FAIL"};
+	const double eps = pow(2.0, -53);
+
+   // If number of arguments is not 4, print help
+   if ( argc != 4) {
+      printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
+      exit( -1 );
+   }
+
+   const int  n = atoi(argv[1]); // matrix size
+   const int ts = atoi(argv[2]); // tile size
+   int check    = atoi(argv[3]); // check result?
+
+   // Compute number of tiles
+   const int nt = n / ts;
+   assert((nt*ts) == n); // tile size should divide size
+
+   // Allocate matrix
+   double * const matrix = (double *) malloc(n * n * sizeof(double));
+   assert(matrix != NULL);
+
+   // Initialize matrix
+   initialize_matrix(n, ts, matrix);
+
+   // Allocate original matrix, and duplicate it, for debugging purposes
+   double * const original_matrix = (double *) malloc(n * n * sizeof(double));
+   assert(original_matrix != NULL);
+
+   // Save a copy of the original matrix
+   for (int i = 0; i < n * n; i++ ) {
+      original_matrix[i] = matrix[i];
+   }
+
+   // Set version description: Indirect blocked matrix
+   const char *version = "I-Blocked matrix";
+
+   // Allocate blocked matrix
+   double *Ah[nt][nt];
+   for (int i = 0; i < nt; i++) {
+      for (int j = 0; j < nt; j++) {
+         Ah[i][j] = malloc(ts * ts * sizeof(double));
+         assert(Ah[i][j] != NULL);
+      }
+   }
+
+   // ---------------------------------------
+   // Convert, compute (time), and re-convert
+   // ---------------------------------------
+   convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
+   const float tref = get_time();
+   cholesky(ts, nt, (double* (*)[nt]) Ah);
+   const float time = get_time() - tref;
+   convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
+
+   // Free blocked matrix
+   for (int i = 0; i < nt; i++) {
+      for (int j = 0; j < nt; j++) {
+         assert(Ah[i][j] != NULL);
+         free(Ah[i][j]);
+      }
+   }
+
+   // Check result, if requested
+   if ( check ) {
+      const char uplo = 'L';
+      if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
+   }
+
+   // Free original matrix, not needed anymore
+   free(original_matrix);
+
+   // Compute GFLOPs (Not verified)
+   float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
+
+   // Print results
+#ifdef VERBOSE
+   printf( "\n" );
+   printf( "============ CHOLESKY RESULTS ============\n" );
+   printf( "  test                  %s\n", argv[0]);
+   printf( "  version               %s\n", version);
+   printf( "  matrix size:          %dx%d\n", n, n);
+   printf( "  tile size:            %dx%d\n", ts, ts);
+   printf( "  number of threads:    %d\n", omp_get_max_threads());
+   printf( "  time (s):             %f\n", time);
+   printf( "  performance (gflops): %f\n", gflops);
+   printf( "  check:                %s\n", result[check]);
+   printf( "==========================================\n" );
+#else
+   printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
+#endif
+
+   // Free matrix
+   free(matrix);
+
+   return 0;
+}
+
--- a/00-worksharing/cholesky-for/solution/cholesky.c
+++ b/00-worksharing/cholesky-for/solution/cholesky.c
@ -0,0 +1,291 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+
+#include <sys/time.h>
+#include <sys/times.h>
+
+#include <math.h>
+#include <mkl.h>
+
+#include "omp.h"
+
+#if !defined(_OPENMP)
+    int omp_get_max_threads() { return 1; }
+    int omp_get_num_threads() { return 1; }
+#endif
+
+void cholesky(int ts, int nt, double* Ah[nt][nt])
+{
+#ifdef VERBOSE
+	printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
+#endif
+
+   for (int k = 0; k < nt; k++) {
+      // Diagonal Block factorization: using LAPACK
+      LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
+
+      // Triangular systems
+      #pragma omp parallel for
+      for (int i = k + 1; i < nt; i++) {
+         cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
+      }
+
+      // Update trailing matrix
+      #pragma omp parallel for
+      for (int i = k + 1; i < nt; i++) {
+         for (int j = k + 1; j < i; j++) {
+            cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
+         }
+         cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
+      }
+   }
+
+#ifdef VERBOSE
+	printf("> ...end of Cholesky Factorization.\n");
+#endif
+}
+
+float get_time()
+{
+	static double gtod_ref_time_sec = 0.0;
+
+	struct timeval tv;
+	gettimeofday(&tv, NULL);
+
+	// If this is the first invocation of through dclock(), then initialize the
+	// "reference time" global variable to the seconds field of the tv struct.
+	if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
+
+	// Normalize the seconds field of the tv struct so that it is relative to the
+	// "reference time" that was recorded during the first invocation of dclock().
+	const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
+
+	// Compute the number of seconds since the reference time.
+	const double t = norm_sec + tv.tv_usec * 1.0e-6;
+
+	return (float) t;
+}
+
+// Robust Check the factorization of the matrix A2
+// Using directly Fortran services: dlacpy_, dtrmm, dlange_
+static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
+{
+#ifdef VERBOSE
+	printf("> Checking the Cholesky Factorization... \n");
+#endif
+	char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
+
+	double *Residual = (double *) malloc(N*N*sizeof(double));
+	double *L1       = (double *) malloc(N*N*sizeof(double));
+	double *L2       = (double *) malloc(N*N*sizeof(double));
+	double *work     = (double *) malloc(N*sizeof(double));
+
+	memset((void*)L1, 0, N*N*sizeof(double));
+	memset((void*)L2, 0, N*N*sizeof(double));
+
+	double alpha= 1.0;
+
+	dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
+
+	/* Dealing with L'L or U'U  */
+	if (uplo == 'U'){
+		dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
+		dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
+		dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
+	} else{
+		dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
+		dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
+		dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
+	}
+
+	/* Compute the Residual || A -L'L|| */
+	for (int i = 0; i < N; i++)
+		for (int j = 0; j < N; j++)
+			Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
+
+	double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
+	double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
+
+#ifdef VERBOSE
+	printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
+#endif
+
+	const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
+
+#ifdef VERBOSE
+	if ( info_factorization) printf("> - Factorization is suspicious!\n");
+	else printf("> - Factorization is CORRECT!\n");
+#endif
+
+	free(Residual); free(L1); free(L2); free(work);
+
+	return info_factorization;
+}
+
+void initialize_matrix(const int n, const int ts, double *matrix)
+{
+#ifdef VERBOSE
+	printf("> Initializing matrix with random values...\n");
+#endif
+
+	int ISEED[4] = {0,0,0,1};
+	int intONE=1;
+
+	for (int i = 0; i < n*n; i+=n) {
+		dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
+	}
+
+	for (int i=0; i<n; i++) {
+		for (int j=0; j<n; j++) {
+			matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
+			matrix[i*n+j] = matrix[j*n+i];
+		}
+	}
+
+   // Diagonal values
+	for (int i = 0; i < n; i++) {
+		matrix[i*n+i] += (double) n;
+   }
+}
+
+void gather_block(const int N, const int ts, double *Alin, double *A)
+{
+	for (int i = 0; i < ts; i++)
+		for (int j = 0; j < ts; j++) {
+			A[i*ts + j] = Alin[i*N + j];
+		}
+}
+
+void scatter_block(const int N, const int ts, double *A, double *Alin)
+{
+	for (int i = 0; i < ts; i++)
+		for (int j = 0; j < ts; j++) {
+			Alin[i*N + j] = A[i*ts + j];
+		}
+}
+
+void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
+{
+#ifdef VERBOSE
+	printf("> Converting linear matrix to blocks...\n");
+#endif
+   for (int i = 0; i < DIM; i++)
+      for (int j = 0; j < DIM; j++) {
+         gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
+      }
+}
+
+void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
+{
+#ifdef VERBOSE
+	printf("> Converting blocked matrix to linear...\n");
+#endif
+   for (int i = 0; i < DIM; i++)
+      for (int j = 0; j < DIM; j++) {
+         scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
+      }
+}
+
+int main(int argc, char* argv[])
+{
+   char *result[3] = {"n/a","pass","FAIL"};
+	const double eps = pow(2.0, -53);
+
+   // If number of arguments is not 4, print help
+   if ( argc != 4) {
+      printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
+      exit( -1 );
+   }
+
+   const int  n = atoi(argv[1]); // matrix size
+   const int ts = atoi(argv[2]); // tile size
+   int check    = atoi(argv[3]); // check result?
+
+   // Compute number of tiles
+   const int nt = n / ts;
+   assert((nt*ts) == n); // tile size should divide size
+
+   // Allocate matrix
+   double * const matrix = (double *) malloc(n * n * sizeof(double));
+   assert(matrix != NULL);
+
+   // Initialize matrix
+   initialize_matrix(n, ts, matrix);
+
+   // Allocate original matrix, and duplicate it, for debugging purposes
+   double * const original_matrix = (double *) malloc(n * n * sizeof(double));
+   assert(original_matrix != NULL);
+
+   // Save a copy of the original matrix
+   for (int i = 0; i < n * n; i++ ) {
+      original_matrix[i] = matrix[i];
+   }
+
+   // Set version description: Indirect blocked matrix
+   const char *version = "I-Blocked matrix";
+
+   // Allocate blocked matrix
+   double *Ah[nt][nt];
+   for (int i = 0; i < nt; i++) {
+      for (int j = 0; j < nt; j++) {
+         Ah[i][j] = malloc(ts * ts * sizeof(double));
+         assert(Ah[i][j] != NULL);
+      }
+   }
+
+   // ---------------------------------------
+   // Convert, compute (time), and re-convert
+   // ---------------------------------------
+   convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
+   const float tref = get_time();
+   cholesky(ts, nt, (double* (*)[nt]) Ah);
+   const float time = get_time() - tref;
+   convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
+
+   // Free blocked matrix
+   for (int i = 0; i < nt; i++) {
+      for (int j = 0; j < nt; j++) {
+         assert(Ah[i][j] != NULL);
+         free(Ah[i][j]);
+      }
+   }
+
+   // Check result, if requested
+   if ( check ) {
+      const char uplo = 'L';
+      if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
+   }
+
+   // Free original matrix, not needed anymore
+   free(original_matrix);
+
+   // Compute GFLOPs (Not verified)
+   float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
+
+   // Print results
+#ifdef VERBOSE
+   printf( "\n" );
+   printf( "============ CHOLESKY RESULTS ============\n" );
+   printf( "  test                  %s\n", argv[0]);
+   printf( "  version               %s\n", version);
+   printf( "  matrix size:          %dx%d\n", n, n);
+   printf( "  tile size:            %dx%d\n", ts, ts);
+   printf( "  number of threads:    %d\n", omp_get_max_threads());
+   printf( "  time (s):             %f\n", time);
+   printf( "  performance (gflops): %f\n", gflops);
+   printf( "  check:                %s\n", result[check]);
+   printf( "==========================================\n" );
+#else
+   printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
+#endif
+
+   // Free matrix
+   free(matrix);
+
+   return 0;
+}
+
--- a/00-worksharing/matmul/Makefile
+++ b/00-worksharing/matmul/Makefile
@ -0,0 +1,23 @@
+BIN_DIR=../bin
+PROGRAM=$(BIN_DIR)/matmul
+
+MKL_HOME=$(MKLROOT)
+
+CC=gcc
+CFLAGS=-O3 -std=gnu99 -fopenmp
+LDFLAGS=-fopenmp
+#VERBOSE=-DVERBOSE
+
+all: $(PROGRAM) 
+
+$(PROGRAM): matmul.c 
+	$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
+
+$(BIN_DIR): 
+	mkdir $@
+
+clean:
+	rm -rf $(PROGRAM) *.o
+
+wipe: clean
+	rm -rf *.out *.err
--- a/00-worksharing/matmul/matmul.c
+++ b/00-worksharing/matmul/matmul.c
@ -0,0 +1,106 @@
+//===-- matmul.c - Different implementations of matrix multiplies -*- C -*-===//
+//
+// Part of the LOMP Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <omp.h>
+
+#define DUMP_MATRIX 0
+
+void matmul_seq(double * C, double * A, double * B, size_t n) {
+  for (size_t i = 0; i < n; ++i) {
+    for (size_t k = 0; k < n; ++k) {
+      for (size_t j = 0; j < n; ++j) {
+        C[i * n + j] += A[i * n + k] * B[k * n + j];
+      }
+    }
+  }
+}
+
+void matmul_par(double * C, double * A, double * B, size_t n) {
+  for (size_t i = 0; i < n; ++i) {
+    for (size_t k = 0; k < n; ++k) {
+      for (size_t j = 0; j < n; ++j) {
+        C[i * n + j] += A[i * n + k] * B[k * n + j];
+      }
+    }
+  }
+}
+
+void init_mat(double * C, double * A, double * B, size_t n) {
+  for (size_t i = 0; i < n; ++i) {
+    for (size_t j = 0; j < n; ++j) {
+      C[i * n + j] = 0.0;
+      A[i * n + j] = 0.5;
+      B[i * n + j] = 0.25;
+    }
+  }
+}
+
+void dump_mat(double * mtx, size_t n) {
+  for (size_t i = 0; i < n; ++i) {
+    for (size_t j = 0; j < n; ++j) {
+      printf("%f ", mtx[i * n + j]);
+    }
+    printf("\n");
+  }
+}
+
+double sum_mat(double * mtx, size_t n) {
+  double sum = 0.0;
+  for (size_t i = 0; i < n; ++i) {
+    for (size_t j = 0; j < n; ++j) {
+      sum += mtx[i * n + j];
+    }
+  }
+  return sum;
+}
+
+int main(int argc, char * argv[]) {
+  double ts, te;
+  double t_seq;
+
+  double * C;
+  double * A;
+  double * B;
+
+  // If number of arguments is not 1, print help
+  if (argc != 2) {
+    printf("%s: matrix_size\n", argv[0]);
+    return EXIT_FAILURE;
+  } 
+  const int  n = atoi(argv[1]); // matrix size
+
+  C = (double *)malloc(sizeof(*C) * n * n);
+  A = (double *)malloc(sizeof(*A) * n * n);
+  B = (double *)malloc(sizeof(*B) * n * n);
+
+  init_mat(C, A, B, n);
+  ts = omp_get_wtime();
+  matmul_seq(C, A, B, n);
+  te = omp_get_wtime();
+#if DUMP_MATRIX
+  dump_mat(C, n);
+#endif
+  t_seq = te - ts;
+  printf("Sum of matrix (serial):   %f, wall time %lf, speed-up %.2lf\n",
+         sum_mat(C, n), (te - ts), t_seq / (te - ts));
+
+  init_mat(C, A, B, n);
+  ts = omp_get_wtime();
+  matmul_par(C, A, B, n);
+  te = omp_get_wtime();
+#if DUMP_MATRIX
+  dump_mat(C, n);
+#endif
+  printf("Sum of matrix (parallel): %f, wall time %lf, speed-up %.2lf\n",
+         sum_mat(C, n), (te - ts), t_seq / (te - ts));
+
+  return EXIT_SUCCESS;
+}
--- a/00-worksharing/matmul/solution/matmul.c
+++ b/00-worksharing/matmul/solution/matmul.c
@ -0,0 +1,108 @@
+//===-- matmul.c - Different implementations of matrix multiplies -*- C -*-===//
+//
+// Part of the LOMP Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <omp.h>
+
+#define DUMP_MATRIX 0
+
+void matmul_seq(double * C, double * A, double * B, size_t n) {
+  for (size_t i = 0; i < n; ++i) {
+    for (size_t k = 0; k < n; ++k) {
+      for (size_t j = 0; j < n; ++j) {
+        C[i * n + j] += A[i * n + k] * B[k * n + j];
+      }
+    }
+  }
+}
+
+void matmul_par(double * C, double * A, double * B, size_t n) {
+#pragma omp parallel for shared(A,B,C) firstprivate(n) \
+                         schedule(static) // collapse(2)
+  for (size_t i = 0; i < n; ++i) {
+    for (size_t k = 0; k < n; ++k) {
+      for (size_t j = 0; j < n; ++j) {
+        C[i * n + j] += A[i * n + k] * B[k * n + j];
+      }
+    }
+  }
+}
+
+void init_mat(double * C, double * A, double * B, size_t n) {
+  for (size_t i = 0; i < n; ++i) {
+    for (size_t j = 0; j < n; ++j) {
+      C[i * n + j] = 0.0;
+      A[i * n + j] = 0.5;
+      B[i * n + j] = 0.25;
+    }
+  }
+}
+
+void dump_mat(double * mtx, size_t n) {
+  for (size_t i = 0; i < n; ++i) {
+    for (size_t j = 0; j < n; ++j) {
+      printf("%f ", mtx[i * n + j]);
+    }
+    printf("\n");
+  }
+}
+
+double sum_mat(double * mtx, size_t n) {
+  double sum = 0.0;
+  for (size_t i = 0; i < n; ++i) {
+    for (size_t j = 0; j < n; ++j) {
+      sum += mtx[i * n + j];
+    }
+  }
+  return sum;
+}
+
+int main(int argc, char * argv[]) {
+  double ts, te;
+  double t_seq;
+
+  double * C;
+  double * A;
+  double * B;
+
+  // If number of arguments is not 1, print help
+  if (argc != 2) {
+    printf("%s: matrix_size\n", argv[0]);
+    return EXIT_FAILURE;
+  } 
+  const int  n = atoi(argv[1]); // matrix size
+
+  C = (double *)malloc(sizeof(*C) * n * n);
+  A = (double *)malloc(sizeof(*A) * n * n);
+  B = (double *)malloc(sizeof(*B) * n * n);
+
+  init_mat(C, A, B, n);
+  ts = omp_get_wtime();
+  matmul_seq(C, A, B, n);
+  te = omp_get_wtime();
+#if DUMP_MATRIX
+  dump_mat(C, n);
+#endif
+  t_seq = te - ts;
+  printf("Sum of matrix (serial):   %f, wall time %lf, speed-up %.2lf\n",
+         sum_mat(C, n), (te - ts), t_seq / (te - ts));
+
+  init_mat(C, A, B, n);
+  ts = omp_get_wtime();
+  matmul_par(C, A, B, n);
+  te = omp_get_wtime();
+#if DUMP_MATRIX
+  dump_mat(C, n);
+#endif
+  printf("Sum of matrix (parallel): %f, wall time %lf, speed-up %.2lf\n",
+         sum_mat(C, n), (te - ts), t_seq / (te - ts));
+
+  return EXIT_SUCCESS;
+}
--- a/00-worksharing/sin-cos/Makefile
+++ b/00-worksharing/sin-cos/Makefile
@ -0,0 +1,21 @@
+BIN_DIR=../bin
+PROGRAM=$(BIN_DIR)/sin-cos
+
+CC=gcc
+CFLAGS=-O3 -std=gnu99 -fopenmp
+LDFLAGS=-lm -fopenmp
+#VERBOSE=-DVERBOSE
+
+all: $(PROGRAM) 
+
+$(PROGRAM): sin-cos.c
+	$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
+
+$(BIN_DIR): 
+	mkdir $@
+
+clean:
+	rm -rf $(PROGRAM) *.o
+
+wipe: clean
+	rm -rf *.out *.err
--- a/00-worksharing/sin-cos/sin-cos.c
+++ b/00-worksharing/sin-cos/sin-cos.c
@ -0,0 +1,58 @@
+/*
+ *  OpenMP lecture exercises
+ *  Copyright (C) 2011 by Christian Terboven <terboven@rz.rwth-aachen.de>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+
+#include <omp.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+
+double do_some_computation(int i)
+{
+	double t = 0.0;
+	int j;
+	for (j = 0; j < i*i; j++)
+	{
+		t += sin((double)j) * cos((double)j);
+	}
+	return t;
+}
+
+
+int main(int argc, char* argv[])
+{
+	const int dimension = 500;
+	int i;
+	double result = 0.0;
+
+	double t1 = omp_get_wtime();
+
+	for (i = 0; i < dimension; i++)
+	{
+		result += do_some_computation(i);
+	}
+
+	double t2 = omp_get_wtime();
+	printf("Computation took %.3lf seconds.\n", t2 - t1);
+	printf("Result is %.3lf.\n", result);
+	
+	return 0;
+}
--- a/00-worksharing/sin-cos/solution/sin-cos.c
+++ b/00-worksharing/sin-cos/solution/sin-cos.c
@ -0,0 +1,59 @@
+/*
+ *  OpenMP lecture exercises
+ *  Copyright (C) 2011 by Christian Terboven <terboven@rz.rwth-aachen.de>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+
+#include <omp.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+
+double do_some_computation(int i)
+{
+	double t = 0.0;
+	int j;
+	for (j = 0; j < i*i; j++)
+	{
+		t += sin((double)j) * cos((double)j);
+	}
+	return t;
+}
+
+
+int main(int argc, char* argv[])
+{
+	const int dimension = 500;
+	int i;
+	double result = 0.0;
+
+	double t1 = omp_get_wtime();
+
+	#pragma omp parallel for schedule(dynamic) reduction(+:result)
+	for (i = 0; i < dimension; i++)
+	{
+		result += do_some_computation(i);
+	}
+
+	double t2 = omp_get_wtime();
+	printf("Computation took %.3lf seconds.\n", t2 - t1);
+	printf("Result is %.3lf.\n", result);
+	
+	return 0;
+}
--- a/01-tasks/Makefile
+++ b/01-tasks/Makefile
@ -0,0 +1,14 @@
+ALL_RECURSIVE=all-recursive clean-recursive 
+
+DIRS=$(shell ls -d *-* | sed '/_/d' 2>/dev/null)
+
+all:   $(CONFIG) all-recursive
+clean: $(CONFIG) clean-recursive
+
+$(ALL_RECURSIVE):
+	@failcom='exit 1';\
+   target=`echo $@ | sed s/-recursive//`; \
+   for subdir in $(DIRS); do \
+      (cd $$subdir && $(MAKE) $$target) || eval $$failcom; \
+   done;
+
--- a/01-tasks/bin/.gitdir
+++ b/01-tasks/bin/.gitdir
--- a/01-tasks/bin/run-cholesky.sh
+++ b/01-tasks/bin/run-cholesky.sh
@ -0,0 +1,28 @@
+#!/bin/bash
+#SBATCH --job-name=cholesky
+#SBATCH --workdir=.
+#SBATCH --output=cholesky_%j.out
+#SBATCH --error=cholesky_%j.err
+#SBATCH --cpus-per-task=48
+#SBATCH --ntasks=1
+#SBATCH --time=00:30:00
+#SBATCH --qos=debug
+
+export IFS=";"
+
+THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48"
+#THREADS="01;02;04;06;08;10;12"
+
+#MSIZES="2048;4096;8192;16384"
+MSIZES="4096"
+
+BSIZES="512"
+
+for MS in $MSIZES; do
+  for BS in $BSIZES; do
+    for threads in $THREADS; do
+      OMP_NUM_THREADS=$threads ./cholesky-tsk $MS $BS 0
+    done
+  done
+done
+
--- a/01-tasks/bin/run-sudoku.sh
+++ b/01-tasks/bin/run-sudoku.sh
@ -0,0 +1,20 @@
+#!/bin/bash
+#SBATCH --job-name=sudoku
+#SBATCH --workdir=.
+#SBATCH --output=sudoku_%j.out
+#SBATCH --error=sudoku_%j.err
+#SBATCH --cpus-per-task=48
+#SBATCH --ntasks=1
+#SBATCH --time=00:30:00
+#SBATCH --qos=debug
+
+export IFS=";"
+
+THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48"
+#THREADS="01;02;04;06;08;10;12"
+
+for threads in $THREADS; do
+  OMP_NUM_THREADS=$threads ./sudoku-tsk 9 3 sudoku-9x9-1.txt 
+  #OMP_NUM_THREADS=$threads ./sudoku-tsk 16 4 sudoku-16x16-1.txt 
+  #OMP_NUM_THREADS=$threads ./sudoku-tsk 25 5 sudoku-25x25-1.txt 
+done
--- a/01-tasks/bin/sudoku-16x16-1.txt
+++ b/01-tasks/bin/sudoku-16x16-1.txt
@ -0,0 +1,16 @@
+0 6 0 0 0 0 0 8 11 0 0 15 14 0 0 16
+15 11 0 0 0 16 14 0 0 0 12 0 0 6 0 0
+13 0 9 12 0 0 0 0 3 16 14 0 15 11 10 0
+2 0 16 0 11 0 15 10 1 0 0 0 0 0 0 0
+0 15 11 10 0 0 16 2 13 8 9 12 0 0 0 0
+12 13 0 0 4 1 5 6 2 3 0 0 0 0 11 10
+5 0 6 1 12 0 9 0 15 11 10 7 16 0 0 3
+0 2 0 0 0 10 0 11 6 0 5 0 0 13 0 9
+10 7 15 11 16 0 0 0 12 13 0 0 0 0 0 6
+9 0 0 0 0 0 1 0 0 2 0 16 10 0 0 11
+1 0 4 6 9 13 0 0 7 0 11 0 3 16 0 0
+16 14 0 0 7 0 10 15 4 6 1 0 0 0 13 8
+11 10 0 15 0 0 0 16 9 12 13 0 0 1 5 4
+0 0 12 0 1 4 6 0 16 0 0 0 11 10 0 0
+0 0 5 0 8 12 13 0 10 0 0 11 2 0 0 14
+3 16 0 0 10 0 0 7 0 0 6 0 0 0 12 0
--- a/01-tasks/bin/sudoku-25x25-1.txt
+++ b/01-tasks/bin/sudoku-25x25-1.txt
@ -0,0 +1,25 @@
+0 8 0 18 0 12 20 21 16 0 0 0 0 0 0 15 0 0 24 5 7 0 19 0 0
+0 0 15 9 5 4 0 23 0 18 12 20 0 21 16 7 0 0 0 2 1 0 0 0 0
+0 0 0 0 14 7 0 0 0 0 0 24 0 0 0 0 10 0 0 16 0 0 23 18 6
+21 20 12 0 16 0 0 25 14 22 7 0 0 19 2 4 0 23 8 6 0 24 0 0 0
+19 0 7 0 2 15 0 0 0 9 0 0 18 0 0 0 0 0 0 0 12 0 21 0 16
+0 23 6 0 0 16 0 0 0 20 14 25 11 0 0 5 24 7 3 9 2 19 1 0 17
+4 21 0 20 0 0 0 12 22 11 2 19 0 0 0 0 0 15 23 0 0 3 7 0 9
+12 25 14 0 22 0 19 0 0 13 0 0 24 7 0 0 0 0 21 10 6 0 15 8 18
+7 0 0 24 9 6 0 15 18 0 0 0 0 0 10 0 13 1 19 0 0 25 0 11 22
+0 19 2 13 17 5 0 0 9 0 6 0 8 15 0 14 11 12 25 22 16 0 0 0 10
+0 0 0 1 0 0 0 0 3 7 8 0 15 0 0 11 12 10 0 0 0 0 18 4 0
+10 16 11 12 25 0 14 22 19 1 0 0 0 17 3 20 0 0 6 21 8 0 0 0 0
+0 0 8 0 23 20 0 18 21 4 11 16 0 0 25 0 7 0 2 0 0 14 22 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 22 19 8 15 0 5 23 24 0 17 0 3
+0 2 0 7 3 8 0 9 23 15 20 0 4 0 21 13 1 0 14 19 0 0 10 12 25
+2 0 0 0 0 18 15 5 0 23 0 0 21 6 20 17 0 14 1 13 0 12 0 25 11
+0 1 0 19 13 9 7 0 24 0 0 0 23 0 8 0 25 0 0 11 0 0 0 21 0
+0 0 10 0 0 0 0 0 0 0 17 1 0 14 0 18 0 5 15 8 9 0 0 0 0
+16 12 0 0 11 0 0 14 0 0 0 0 3 2 24 0 21 0 4 20 0 15 0 0 8
+0 0 0 0 0 10 0 0 0 0 0 12 0 0 11 9 3 0 0 0 17 0 14  19 13
+24 0 0 0 15 21 18 0 4 0 0 0 16 20 0 3 2 13 0 7 19 22 0 0 1
+0 18 0 0 0 0 10 0 0 16 0 22 0 0 0 23 0 24 9 0 3 17 0 0 0
+0 0 0 0 0 0 0 0 15 5 0 0 6 8 4 0 14 11 22 1 25 0 20 16 12
+0 22 19 14 1 3 0 0 0 0 23 9 5 0 15 25 0 20 10 12 21 0 8 0 4
+20 0 25 16 0 0 22 11 1 0 3 0 2 13 7 21 0 0 0 0 0 9 0 0 0
--- a/01-tasks/bin/sudoku-9x9-1.txt
+++ b/01-tasks/bin/sudoku-9x9-1.txt
@ -0,0 +1,10 @@
+5 3 0 0 7 0 0 0 0
+6 0 0 1 9 5 3 0 0
+0 9 8 0 0 0 0 6 0
+8 0 0 0 6 0 0 0 3
+4 0 0 8 0 3 0 0 1
+7 0 0 0 2 0 0 0 6
+0 6 0 0 0 0 2 8 0
+0 0 0 4 1 9 0 0 5
+0 0 0 0 8 0 0 7 9
+
--- a/01-tasks/cholesky-tsk/Makefile
+++ b/01-tasks/cholesky-tsk/Makefile
@ -0,0 +1,23 @@
+BIN_DIR=../bin
+PROGRAM=$(BIN_DIR)/cholesky-tsk
+
+MKL_HOME=$(MKLROOT)
+
+CC=gcc
+CFLAGS=-O3 -std=gnu99 -I$(MKL_HOME)/include -fopenmp
+LDFLAGS=-L$(MKL_HOME)/lib/intel64 -lmkl_sequential -lmkl_core -lmkl_rt -lpthread -lm -fopenmp
+VERBOSE=-DVERBOSE
+
+all: $(PROGRAM) 
+
+$(PROGRAM): cholesky.c 
+	$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
+
+$(BIN_DIR): 
+	mkdir $@
+
+clean:
+	rm -rf $(PROGRAM) *.o
+
+wipe: clean
+	rm -rf *.out *.err
--- a/01-tasks/cholesky-tsk/cholesky-solved.c
+++ b/01-tasks/cholesky-tsk/cholesky-solved.c
@ -0,0 +1,296 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+
+#include <sys/time.h>
+#include <sys/times.h>
+
+#include <math.h>
+#include <mkl.h>
+
+#include "omp.h"
+
+#if !defined(_OPENMP)
+    int omp_get_max_threads() { return 1; }
+    int omp_get_num_threads() { return 1; }
+#endif
+
+void cholesky(int ts, int nt, double* Ah[nt][nt])
+{
+#ifdef VERBOSE
+	printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
+#endif
+
+   #pragma omp parallel
+   #pragma omp single
+   for (int k = 0; k < nt; k++) {
+      // Diagonal Block factorization: using LAPACK
+      LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
+
+      // Triangular systems
+      for (int i = k + 1; i < nt; i++) {
+         #pragma omp task
+         cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
+      }
+      #pragma omp taskwait
+
+      // Update trailing matrix
+      for (int i = k + 1; i < nt; i++) {
+         for (int j = k + 1; j < i; j++) {
+            #pragma omp task
+            cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
+         }
+         #pragma omp task
+         cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
+      }
+      #pragma omp taskwait
+   }
+
+#ifdef VERBOSE
+	printf("> ...end of Cholesky Factorization.\n");
+#endif
+}
+
+float get_time()
+{
+	static double gtod_ref_time_sec = 0.0;
+
+	struct timeval tv;
+	gettimeofday(&tv, NULL);
+
+	// If this is the first invocation of through dclock(), then initialize the
+	// "reference time" global variable to the seconds field of the tv struct.
+	if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
+
+	// Normalize the seconds field of the tv struct so that it is relative to the
+	// "reference time" that was recorded during the first invocation of dclock().
+	const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
+
+	// Compute the number of seconds since the reference time.
+	const double t = norm_sec + tv.tv_usec * 1.0e-6;
+
+	return (float) t;
+}
+
+// Robust Check the factorization of the matrix A2
+// Using directly Fortran services: dlacpy_, dtrmm, dlange_
+static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
+{
+#ifdef VERBOSE
+	printf("> Checking the Cholesky Factorization... \n");
+#endif
+	char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
+
+	double *Residual = (double *) malloc(N*N*sizeof(double));
+	double *L1       = (double *) malloc(N*N*sizeof(double));
+	double *L2       = (double *) malloc(N*N*sizeof(double));
+	double *work     = (double *) malloc(N*sizeof(double));
+
+	memset((void*)L1, 0, N*N*sizeof(double));
+	memset((void*)L2, 0, N*N*sizeof(double));
+
+	double alpha= 1.0;
+
+	dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
+
+	/* Dealing with L'L or U'U  */
+	if (uplo == 'U'){
+		dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
+		dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
+		dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
+	} else{
+		dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
+		dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
+		dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
+	}
+
+	/* Compute the Residual || A -L'L|| */
+	for (int i = 0; i < N; i++)
+		for (int j = 0; j < N; j++)
+			Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
+
+	double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
+	double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
+
+#ifdef VERBOSE
+	printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
+#endif
+
+	const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
+
+#ifdef VERBOSE
+	if ( info_factorization) printf("> - Factorization is suspicious!\n");
+	else printf("> - Factorization is CORRECT!\n");
+#endif
+
+	free(Residual); free(L1); free(L2); free(work);
+
+	return info_factorization;
+}
+
+void initialize_matrix(const int n, const int ts, double *matrix)
+{
+#ifdef VERBOSE
+	printf("> Initializing matrix with random values...\n");
+#endif
+
+	int ISEED[4] = {0,0,0,1};
+	int intONE=1;
+
+	for (int i = 0; i < n*n; i+=n) {
+		dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
+	}
+
+	for (int i=0; i<n; i++) {
+		for (int j=0; j<n; j++) {
+			matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
+			matrix[i*n+j] = matrix[j*n+i];
+		}
+	}
+
+   // Diagonal values
+	for (int i = 0; i < n; i++) {
+		matrix[i*n+i] += (double) n;
+   }
+}
+
+void gather_block(const int N, const int ts, double *Alin, double *A)
+{
+	for (int i = 0; i < ts; i++)
+		for (int j = 0; j < ts; j++) {
+			A[i*ts + j] = Alin[i*N + j];
+		}
+}
+
+void scatter_block(const int N, const int ts, double *A, double *Alin)
+{
+	for (int i = 0; i < ts; i++)
+		for (int j = 0; j < ts; j++) {
+			Alin[i*N + j] = A[i*ts + j];
+		}
+}
+
+void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
+{
+#ifdef VERBOSE
+	printf("> Converting linear matrix to blocks...\n");
+#endif
+   for (int i = 0; i < DIM; i++)
+      for (int j = 0; j < DIM; j++) {
+         gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
+      }
+}
+
+void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
+{
+#ifdef VERBOSE
+	printf("> Converting blocked matrix to linear...\n");
+#endif
+   for (int i = 0; i < DIM; i++)
+      for (int j = 0; j < DIM; j++) {
+         scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
+      }
+}
+
+int main(int argc, char* argv[])
+{
+   char *result[3] = {"n/a","pass","FAIL"};
+	const double eps = pow(2.0, -53);
+
+   // If number of arguments is not 4, print help
+   if ( argc != 4) {
+      printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
+      exit( -1 );
+   }
+
+   const int  n = atoi(argv[1]); // matrix size
+   const int ts = atoi(argv[2]); // tile size
+   int check    = atoi(argv[3]); // check result?
+
+   // Compute number of tiles
+   const int nt = n / ts;
+   assert((nt*ts) == n); // tile size should divide size
+
+   // Allocate matrix
+   double * const matrix = (double *) malloc(n * n * sizeof(double));
+   assert(matrix != NULL);
+
+   // Initialize matrix
+   initialize_matrix(n, ts, matrix);
+
+   // Allocate original matrix, and duplicate it, for debugging purposes
+   double * const original_matrix = (double *) malloc(n * n * sizeof(double));
+   assert(original_matrix != NULL);
+
+   // Save a copy of the original matrix
+   for (int i = 0; i < n * n; i++ ) {
+      original_matrix[i] = matrix[i];
+   }
+
+   // Set version description: Indirect blocked matrix
+   const char *version = "I-Blocked matrix";
+
+   // Allocate blocked matrix
+   double *Ah[nt][nt];
+   for (int i = 0; i < nt; i++) {
+      for (int j = 0; j < nt; j++) {
+         Ah[i][j] = malloc(ts * ts * sizeof(double));
+         assert(Ah[i][j] != NULL);
+      }
+   }
+
+   // ---------------------------------------
+   // Convert, compute (time), and re-convert
+   // ---------------------------------------
+   convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
+   const float tref = get_time();
+   cholesky(ts, nt, (double* (*)[nt]) Ah);
+   const float time = get_time() - tref;
+   convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
+
+   // Free blocked matrix
+   for (int i = 0; i < nt; i++) {
+      for (int j = 0; j < nt; j++) {
+         assert(Ah[i][j] != NULL);
+         free(Ah[i][j]);
+      }
+   }
+
+   // Check result, if requested
+   if ( check ) {
+      const char uplo = 'L';
+      if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
+   }
+
+   // Free original matrix, not needed anymore
+   free(original_matrix);
+
+   // Compute GFLOPs (Not verified)
+   float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
+
+   // Print results
+#ifdef VERBOSE
+   printf( "\n" );
+   printf( "============ CHOLESKY RESULTS ============\n" );
+   printf( "  test                  %s\n", argv[0]);
+   printf( "  version               %s\n", version);
+   printf( "  matrix size:          %dx%d\n", n, n);
+   printf( "  tile size:            %dx%d\n", ts, ts);
+   printf( "  number of threads:    %d\n", omp_get_max_threads());
+   printf( "  time (s):             %f\n", time);
+   printf( "  performance (gflops): %f\n", gflops);
+   printf( "  check:                %s\n", result[check]);
+   printf( "==========================================\n" );
+#else
+   printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
+#endif
+
+   // Free matrix
+   free(matrix);
+
+   return 0;
+}
+
--- a/01-tasks/cholesky-tsk/cholesky.c
+++ b/01-tasks/cholesky-tsk/cholesky.c
@ -0,0 +1,292 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+
+#include <sys/time.h>
+#include <sys/times.h>
+
+#include <math.h>
+#include <mkl.h>
+
+#include "omp.h"
+
+#if !defined(_OPENMP)
+    int omp_get_max_threads() { return 1; }
+    int omp_get_num_threads() { return 1; }
+#endif
+
+void cholesky(int ts, int nt, double* Ah[nt][nt])
+{
+#ifdef VERBOSE
+	printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
+#endif
+
+   // TODO: Parallel region is required in order to run in parallel.
+   // TODO: Kernel invocations could be task candidates.
+   // TODO: Add scheduler restrictions: taskwait, taskgroup, etc.
+   for (int k = 0; k < nt; k++) {
+      // Diagonal Block factorization: using LAPACK
+      LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
+
+      // Triangular systems
+      for (int i = k + 1; i < nt; i++) {
+         cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
+      }
+
+      // Update trailing matrix
+      for (int i = k + 1; i < nt; i++) {
+         for (int j = k + 1; j < i; j++) {
+            cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
+         }
+         cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
+      }
+   }  
+
+#ifdef VERBOSE
+	printf("> ...end of Cholesky Factorization.\n");
+#endif
+}
+
+float get_time()
+{
+	static double gtod_ref_time_sec = 0.0;
+
+	struct timeval tv;
+	gettimeofday(&tv, NULL);
+
+	// If this is the first invocation of through dclock(), then initialize the
+	// "reference time" global variable to the seconds field of the tv struct.
+	if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
+
+	// Normalize the seconds field of the tv struct so that it is relative to the
+	// "reference time" that was recorded during the first invocation of dclock().
+	const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
+
+	// Compute the number of seconds since the reference time.
+	const double t = norm_sec + tv.tv_usec * 1.0e-6;
+
+	return (float) t;
+}
+
+// Robust Check the factorization of the matrix A2
+// Using directly Fortran services: dlacpy_, dtrmm, dlange_
+static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
+{
+#ifdef VERBOSE
+	printf("> Checking the Cholesky Factorization... \n");
+#endif
+	char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
+
+	double *Residual = (double *) malloc(N*N*sizeof(double));
+	double *L1       = (double *) malloc(N*N*sizeof(double));
+	double *L2       = (double *) malloc(N*N*sizeof(double));
+	double *work     = (double *) malloc(N*sizeof(double));
+
+	memset((void*)L1, 0, N*N*sizeof(double));
+	memset((void*)L2, 0, N*N*sizeof(double));
+
+	double alpha= 1.0;
+
+	dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
+
+	/* Dealing with L'L or U'U  */
+	if (uplo == 'U'){
+		dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
+		dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
+		dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
+	} else{
+		dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
+		dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
+		dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
+	}
+
+	/* Compute the Residual || A -L'L|| */
+	for (int i = 0; i < N; i++)
+		for (int j = 0; j < N; j++)
+			Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
+
+	double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
+	double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
+
+#ifdef VERBOSE
+	printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
+#endif
+
+	const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
+
+#ifdef VERBOSE
+	if ( info_factorization) printf("> - Factorization is suspicious!\n");
+	else printf("> - Factorization is CORRECT!\n");
+#endif
+
+	free(Residual); free(L1); free(L2); free(work);
+
+	return info_factorization;
+}
+
+void initialize_matrix(const int n, const int ts, double *matrix)
+{
+#ifdef VERBOSE
+	printf("> Initializing matrix with random values...\n");
+#endif
+
+	int ISEED[4] = {0,0,0,1};
+	int intONE=1;
+
+	for (int i = 0; i < n*n; i+=n) {
+		dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
+	}
+
+	for (int i=0; i<n; i++) {
+		for (int j=0; j<n; j++) {
+			matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
+			matrix[i*n+j] = matrix[j*n+i];
+		}
+	}
+
+   // Diagonal values
+	for (int i = 0; i < n; i++) {
+		matrix[i*n+i] += (double) n;
+   }
+}
+
+void gather_block(const int N, const int ts, double *Alin, double *A)
+{
+	for (int i = 0; i < ts; i++)
+		for (int j = 0; j < ts; j++) {
+			A[i*ts + j] = Alin[i*N + j];
+		}
+}
+
+void scatter_block(const int N, const int ts, double *A, double *Alin)
+{
+	for (int i = 0; i < ts; i++)
+		for (int j = 0; j < ts; j++) {
+			Alin[i*N + j] = A[i*ts + j];
+		}
+}
+
+void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
+{
+#ifdef VERBOSE
+	printf("> Converting linear matrix to blocks...\n");
+#endif
+   for (int i = 0; i < DIM; i++)
+      for (int j = 0; j < DIM; j++) {
+         gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
+      }
+}
+
+void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
+{
+#ifdef VERBOSE
+	printf("> Converting blocked matrix to linear...\n");
+#endif
+   for (int i = 0; i < DIM; i++)
+      for (int j = 0; j < DIM; j++) {
+         scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
+      }
+}
+
+int main(int argc, char* argv[])
+{
+   char *result[3] = {"n/a","pass","FAIL"};
+	const double eps = pow(2.0, -53);
+
+   // If number of arguments is not 4, print help
+   if ( argc != 4) {
+      printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
+      exit( -1 );
+   }
+
+   const int  n = atoi(argv[1]); // matrix size
+   const int ts = atoi(argv[2]); // tile size
+   int check    = atoi(argv[3]); // check result?
+
+   // Compute number of tiles
+   const int nt = n / ts;
+   assert((nt*ts) == n); // tile size should divide size
+
+   // Allocate matrix
+   double * const matrix = (double *) malloc(n * n * sizeof(double));
+   assert(matrix != NULL);
+
+   // Initialize matrix
+   initialize_matrix(n, ts, matrix);
+
+   // Allocate original matrix, and duplicate it, for debugging purposes
+   double * const original_matrix = (double *) malloc(n * n * sizeof(double));
+   assert(original_matrix != NULL);
+
+   // Save a copy of the original matrix
+   for (int i = 0; i < n * n; i++ ) {
+      original_matrix[i] = matrix[i];
+   }
+
+   // Set version description: Indirect blocked matrix
+   const char *version = "I-Blocked matrix";
+
+   // Allocate blocked matrix
+   double *Ah[nt][nt];
+   for (int i = 0; i < nt; i++) {
+      for (int j = 0; j < nt; j++) {
+         Ah[i][j] = malloc(ts * ts * sizeof(double));
+         assert(Ah[i][j] != NULL);
+      }
+   }
+
+   // ---------------------------------------
+   // Convert, compute (time), and re-convert
+   // ---------------------------------------
+   convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
+   const float tref = get_time();
+   cholesky(ts, nt, (double* (*)[nt]) Ah);
+   const float time = get_time() - tref;
+   convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
+
+   // Free blocked matrix
+   for (int i = 0; i < nt; i++) {
+      for (int j = 0; j < nt; j++) {
+         assert(Ah[i][j] != NULL);
+         free(Ah[i][j]);
+      }
+   }
+
+   // Check result, if requested
+   if ( check ) {
+      const char uplo = 'L';
+      if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
+   }
+
+   // Free original matrix, not needed anymore
+   free(original_matrix);
+
+   // Compute GFLOPs (Not verified)
+   float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
+
+   // Print results
+#ifdef VERBOSE
+   printf( "\n" );
+   printf( "============ CHOLESKY RESULTS ============\n" );
+   printf( "  test                  %s\n", argv[0]);
+   printf( "  version               %s\n", version);
+   printf( "  matrix size:          %dx%d\n", n, n);
+   printf( "  tile size:            %dx%d\n", ts, ts);
+   printf( "  number of threads:    %d\n", omp_get_max_threads());
+   printf( "  time (s):             %f\n", time);
+   printf( "  performance (gflops): %f\n", gflops);
+   printf( "  check:                %s\n", result[check]);
+   printf( "==========================================\n" );
+#else
+   printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
+#endif
+
+   // Free matrix
+   free(matrix);
+
+   return 0;
+}
+
--- a/01-tasks/sudoku-tsk/Makefile
+++ b/01-tasks/sudoku-tsk/Makefile
@ -0,0 +1,25 @@
+BIN_DIR=../bin
+PROGRAM=$(BIN_DIR)/sudoku-tsk
+
+CC=g++
+CFLAGS=-c -O3 -fopenmp -DUSE_SEQUENTIAL_CUTOFF
+LDFLAGS=-fopenmp
+
+OBJS=SudokuBoard.o sudoku.o
+
+all: $(PROGRAM) 
+
+$(PROGRAM): $(OBJS)
+	$(CC) -o $@ $^ $(LDFLAGS)
+
+%.o: %.cpp
+	$(CC) $(CFLAGS) -o $@ $^ 
+
+$(BIN_DIR): 
+	mkdir $@
+
+clean:
+	rm -rf $(PROGRAM) *.o
+
+wipe: clean
+	rm -rf *.out *.err
--- a/01-tasks/sudoku-tsk/SudokuBoard.cpp
+++ b/01-tasks/sudoku-tsk/SudokuBoard.cpp
@ -0,0 +1,119 @@
+/*                    
+ *  This file is part of Christian's OpenMP Task-parallel Sudoku Solver
+ *  
+ *  Copyright (C) 2013 by Christian Terboven <christian@terboven.com>
+ *                                                                       
+ *  This program is free software; you can redistribute it and/or modify 
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "SudokuBoard.h"
+
+#include <cstring>
+
+CSudokuBoard::CSudokuBoard(int fsize, int bsize)
+	: field_size(fsize), block_size(bsize), solutions(-1)
+{
+	field = new int[field_size*field_size];
+}
+
+
+CSudokuBoard::CSudokuBoard(const CSudokuBoard& other)
+	: field_size(other.getFieldSize()), block_size(other.getBlockSize()), solutions(other.getNumSolutions())
+{
+	field = new int[field_size*field_size];
+	std::memcpy(field, other.field, sizeof(int) * field_size*field_size);
+}
+
+
+CSudokuBoard::~CSudokuBoard(void)
+{
+	delete[] field;
+}
+
+
+bool CSudokuBoard::loadFromFile(char *filename)
+{
+  std::ifstream ifile(filename);
+  
+  if (!ifile) {
+    std::cout << "There was an error opening the input file " << filename << std::endl;
+    std::cout << std::endl;
+    return false;
+  }
+
+  for (int i = 0; i < this->field_size; i++) {
+    for (int j = 0; j < this->field_size; j++) {
+	  ifile >> this->field[ACCESS(i,j)];
+    }
+  }
+
+  return true;
+}
+
+
+void CSudokuBoard::printBoard()
+{
+	for(int i = 0; i < field_size; i++) {
+		for(int j = 0; j < field_size; j++) {
+			std::cout << std::setw(3) << 
+				this->field[ACCESS(i,j)] 
+				<< " ";
+		}
+		std::cout << std::endl;
+	}
+}
+
+
+bool CSudokuBoard::check(int x, int y, int value)
+{
+	if(checkHorizontal(y, value))
+		return true;
+	if(checkVertical(x, value))
+		return true;
+	if(checkBox(x, y, value))
+		return true;
+	return false;
+}
+
+
+bool CSudokuBoard::checkHorizontal(int y, int value) {
+	for(int i = 0; i < field_size; i++)
+		if(field[ACCESS(y,i)] == value)
+			return true;
+	return false;
+}
+
+
+bool CSudokuBoard::checkVertical(int x, int value) {
+	for(int i = 0; i < field_size; i++)
+		if(field[ACCESS(i,x)] == value)
+			return true;
+	return false;
+}
+
+
+bool CSudokuBoard::checkBox(int x, int y, int value) {
+	// find suitable box edge
+	int x_box = (int)(x / block_size) * block_size;
+	int y_box = (int)(y / block_size) * block_size;
+
+	for(int i = y_box; i < y_box + block_size; i++)
+		for(int j = x_box; j < x_box + block_size; j++)
+			if(field[ACCESS(i,j)] == value)
+				return true;
+  
+	return false;
+}
--- a/01-tasks/sudoku-tsk/SudokuBoard.h
+++ b/01-tasks/sudoku-tsk/SudokuBoard.h
@ -0,0 +1,96 @@
+/*                    
+ *  This file is part of Christian's OpenMP Task-parallel Sudoku Solver
+ *  
+ *  Copyright (C) 2013 by Christian Terboven <christian@terboven.com>
+ *                                                                       
+ *  This program is free software; you can redistribute it and/or modify 
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <vector>
+
+#define ACCESS(x, y) (this->field_size*(x) + (y))
+
+/**
+ * @brief Representation of the Sudoku board, includes solver and utility functions
+ */
+class CSudokuBoard
+{
+public:
+	CSudokuBoard(int fsize, int bsize);
+	CSudokuBoard(const CSudokuBoard& other);
+	~CSudokuBoard(void);
+
+	inline int getNumSolutions() const {
+		return this->solutions;
+	}
+	
+	inline void incrementSolutionCounter() {
+		if (this->solutions == -1)
+			this->solutions = 1;
+		else
+			this->solutions++;
+	}
+	
+	inline int getFieldSize() const {
+		return this->field_size;
+	}
+	
+	inline int getBlockSize() const {
+		return this->block_size;
+	}
+	
+	inline int get(int x, int y) const {
+		return this->field[ACCESS(x,y)];
+	}
+	
+	inline void set(int x, int y, int value) {
+		this->field[ACCESS(x, y)] = value;
+	}
+
+	/**
+	 * Read Sudoku template from file
+	 * @param filename name of file to read input from
+	 * @return true if reading file was successful, false if not
+	 */
+	bool loadFromFile(char *filename);
+
+	/**
+	 * Print the Sudoku board to stdout
+	 */
+	void printBoard();
+
+	/**
+	 * Check whether value already used
+	 * @return true if already used, false if not
+	 */
+	bool check(int x, int y, int value);
+
+
+private:
+	bool checkHorizontal(int y, int value);
+	bool checkVertical(int x, int value);
+	bool checkBox(int x, int y, int value);
+	
+	int field_size;
+	int block_size;
+
+	int *field;
+
+	int solutions;
+};
--- a/01-tasks/sudoku-tsk/sudoku-solved.cpp
+++ b/01-tasks/sudoku-tsk/sudoku-solved.cpp
@ -0,0 +1,177 @@
+/*                    
+ *  This file is part of Christian's OpenMP Task-parallel Sudoku Solver
+ *  
+ *  Copyright (C) 2013 by Christian Terboven <christian@terboven.com>
+ *                                                                       
+ *  This program is free software; you can redistribute it and/or modify 
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <iostream>
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <omp.h>
+
+#include "SudokuBoard.h"
+
+
+int found_sudokus = 0;
+
+
+/**
+  * Solve the Sudoku puzzle by finding and counting all solutions (if all are requested)
+  */
+bool solve(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true, bool findAllSolutions = true)
+{
+	if(x == sudoku->getFieldSize()) {                 // end of line
+		y++;
+		x = 0;
+		if(y == sudoku->getFieldSize())           // end
+			return true;
+	}   
+  
+	if(sudoku->get(y, x) > 0)                         // field already set
+		return solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions);    // tackle next field
+    
+	for(int i = 1; i <= sudoku->getFieldSize(); i++) {            // try all numbers
+		if(!sudoku->check(x, y, i)) {
+			sudoku->set(y, x, i);                  // if number fits, set it
+			if(solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions)) {  // tackle next field
+#pragma omp atomic
+				found_sudokus++;
+				sudoku->incrementSolutionCounter();     // solution found :-)
+				if (printAnyFoundSolution) {
+					std::cout << "The following is a valid solution:" << std::endl;
+					sudoku->printBoard();
+					std::cout << std::endl;
+				}
+				if (!findAllSolutions) {
+					return true;                // return, as only one solution was asked for
+				}
+			}
+		}
+	}
+	
+	sudoku->set(y, x, 0);                            // no solution found, reset field
+	return false;
+}
+
+
+
+/**
+  * Solve the Sudoku puzzle in parallel by finding and counting all solutions
+  */
+bool solve_parallel(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true)
+{
+	if (x == sudoku->getFieldSize()) {                       // end of line
+		y++;
+		x = 0;
+		if(y == sudoku->getFieldSize())                     // end
+			return true;
+	}   
+  
+	if (sudoku->get(y, x) > 0) {                         // field already set
+		return solve_parallel(x+1, y, sudoku, printAnyFoundSolution);    // tackle next field
+	}
+    
+#if USE_SEQUENTIAL_CUTOFF
+	if ( y > 1 ) {
+		return solve(x, y, sudoku, printAnyFoundSolution);
+	}
+#endif
+	
+	for (int i = 1; i <= sudoku->getFieldSize(); i++) {      // try all numbers
+		if (!sudoku->check(x, y, i)) {
+#pragma omp task firstprivate(i,x,y,sudoku)
+			{
+				CSudokuBoard new_sudoku(*sudoku);
+				new_sudoku.set(y, x, i);                    // if number fits, set it
+				
+				if (solve_parallel(x+1, y, &new_sudoku, printAnyFoundSolution)) {  // tackle next field
+#pragma omp atomic
+					found_sudokus++;
+	//				sudoku->incrementSolutionCounter();     // solution found :-)
+					if (printAnyFoundSolution) {
+						std::cout << "The following is a valid solution:" << std::endl;
+						new_sudoku.printBoard();
+						std::cout << std::endl;
+					}
+				}
+			}
+		}
+	}
+	
+#pragma omp taskwait
+	sudoku->set(y, x, 0);                            // no solution found, reset field
+	return false;
+}
+
+
+
+/** @brief program entry point
+ */
+int main(int argc, char* argv[]) {
+	// measure the time
+	double t3, t4;
+	int nthreads = 0;
+
+	// expect three command line arguments: field size, block size, and input file
+	if (argc != 4) {
+		std::cout << "Usage: sudoku.exe <field size> <block size> <input filename>" << std::endl;
+		std::cout << std::endl;
+		return -1;
+	}
+	else {
+		CSudokuBoard *sudoku1 = new CSudokuBoard(atoi(argv[1]), atoi(argv[2]));
+		if (!sudoku1->loadFromFile(argv[3])) {
+			std::cout << "There was an error reading a Sudoku template from " << argv[3] << std::endl;
+			std::cout << std::endl;
+			return -1;
+		}
+
+		// print the Sudoku board template
+		std::cout << "Given Sudoku template" << std::endl;
+		sudoku1->printBoard();
+
+		// solve the Sudoku by finding (and printing) all solutions
+		t3 = omp_get_wtime();
+      #pragma omp parallel sections
+		{
+			solve_parallel(0, 0, sudoku1, false);
+		}		
+		t4 = omp_get_wtime();
+
+#pragma omp parallel 
+		{
+#pragma omp master
+			nthreads = omp_get_num_threads();
+		}
+
+		std::cout << std::endl;
+		std::cout << "In total there were " << sudoku1->getNumSolutions() << " solutions found in serial." << std::endl;
+		std::cout << "In total there were " << found_sudokus << " solutions found in parallel " << std::endl;
+		std::cout << std::endl;
+		
+		delete sudoku1;
+	}
+	
+	// print the time
+	std::cout << "Parallel computation took " << t4 - t3 << " seconds (" 
+			<< nthreads << " threads)." << std::endl << std::endl;
+
+	return 0;
+}
--- a/01-tasks/sudoku-tsk/sudoku.cpp
+++ b/01-tasks/sudoku-tsk/sudoku.cpp
@ -0,0 +1,169 @@
+/*                    
+ *  This file is part of Christian's OpenMP Task-parallel Sudoku Solver
+ *  
+ *  Copyright (C) 2013 by Christian Terboven <christian@terboven.com>
+ *                                                                       
+ *  This program is free software; you can redistribute it and/or modify 
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <iostream>
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <omp.h>
+
+#include "SudokuBoard.h"
+
+
+int found_sudokus = 0;
+
+
+/**
+  * Solve the Sudoku puzzle by finding and counting all solutions (if all are requested)
+  */
+bool solve(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true, bool findAllSolutions = true)
+{
+	if(x == sudoku->getFieldSize()) {                 // end of line
+		y++;
+		x = 0;
+		if(y == sudoku->getFieldSize())           // end
+			return true;
+	}   
+  
+	if(sudoku->get(y, x) > 0)                         // field already set
+		return solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions);    // tackle next field
+    
+	for(int i = 1; i <= sudoku->getFieldSize(); i++) {            // try all numbers
+		if(!sudoku->check(x, y, i)) {
+			sudoku->set(y, x, i);                  // if number fits, set it
+			if(solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions)) {  // tackle next field
+#pragma omp atomic
+				found_sudokus++;
+				sudoku->incrementSolutionCounter();     // solution found :-)
+				if (printAnyFoundSolution) {
+					std::cout << "The following is a valid solution:" << std::endl;
+					sudoku->printBoard();
+					std::cout << std::endl;
+				}
+				if (!findAllSolutions) {
+					return true;                // return, as only one solution was asked for
+				}
+			}
+		}
+	}
+	
+	sudoku->set(y, x, 0);                            // no solution found, reset field
+	return false;
+}
+
+
+
+/**
+  * Solve the Sudoku puzzle in parallel by finding and counting all solutions
+  */
+bool solve_parallel(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true)
+{
+	if (x == sudoku->getFieldSize()) {                       // end of line
+		y++;
+		x = 0;
+		if(y == sudoku->getFieldSize())                     // end
+			return true;
+	}   
+  
+	if (sudoku->get(y, x) > 0) {                         // field already set
+		return solve_parallel(x+1, y, sudoku, printAnyFoundSolution);    // tackle next field
+	}
+    
+#if USE_SEQUENTIAL_CUTOFF
+	if ( y > 1 ) {
+		return solve(x, y, sudoku, printAnyFoundSolution);
+	}
+#endif
+	
+// TODO: Task candidates (for each possible value?)
+	for (int i = 1; i <= sudoku->getFieldSize(); i++) {      // try all numbers
+		if (!sudoku->check(x, y, i)) {
+			{
+				CSudokuBoard new_sudoku(*sudoku);
+				new_sudoku.set(y, x, i);                    // if number fits, set it
+				
+				if (solve_parallel(x+1, y, &new_sudoku, printAnyFoundSolution)) {  // tackle next field
+#pragma omp atomic
+					found_sudokus++;
+	//				sudoku->incrementSolutionCounter();     // solution found :-)
+					if (printAnyFoundSolution) {
+						std::cout << "The following is a valid solution:" << std::endl;
+						new_sudoku.printBoard();
+						std::cout << std::endl;
+					}
+				}
+			}
+		}
+	}
+
+// TODO: After creating tasks for this level we will need to synchronize
+	sudoku->set(y, x, 0);                            // no solution found, reset field
+	return false;
+}
+
+
+
+/** @brief program entry point
+ */
+int main(int argc, char* argv[]) {
+	// measure the time
+	double t3, t4;
+
+	// expect three command line arguments: field size, block size, and input file
+	if (argc != 4) {
+		std::cout << "Usage: sudoku.exe <field size> <block size> <input filename>" << std::endl;
+		std::cout << std::endl;
+		return -1;
+	}
+	else {
+		CSudokuBoard *sudoku1 = new CSudokuBoard(atoi(argv[1]), atoi(argv[2]));
+		if (!sudoku1->loadFromFile(argv[3])) {
+			std::cout << "There was an error reading a Sudoku template from " << argv[3] << std::endl;
+			std::cout << std::endl;
+			return -1;
+		}
+
+		// print the Sudoku board template
+		std::cout << "Given Sudoku template" << std::endl;
+		sudoku1->printBoard();
+
+		// solve the Sudoku by finding (and printing) all solutions
+		t3 = omp_get_wtime();
+      // TODO: Sudoku solver needs to execute within a parallel region
+		{
+			solve_parallel(0, 0, sudoku1, false);
+		}		
+		t4 = omp_get_wtime();
+
+		std::cout << std::endl;
+		std::cout << "In total there were " << found_sudokus << " solutions found in parallel " << std::endl;
+		std::cout << std::endl;
+		
+		delete sudoku1;
+	}
+	
+	// print the time
+	std::cout << "Parallel computation took " << t4 - t3 << " seconds (" 
+			<< omp_get_max_threads() << " threads)." << std::endl << std::endl;
+
+	return 0;
+}
--- a/02-dependencies/Makefile
+++ b/02-dependencies/Makefile
@ -0,0 +1,14 @@
+ALL_RECURSIVE=all-recursive clean-recursive 
+
+DIRS=$(shell ls -d *-* | sed '/_/d' 2>/dev/null)
+
+all:   $(CONFIG) all-recursive
+clean: $(CONFIG) clean-recursive
+
+$(ALL_RECURSIVE):
+	@failcom='exit 1';\
+   target=`echo $@ | sed s/-recursive//`; \
+   for subdir in $(DIRS); do \
+      (cd $$subdir && $(MAKE) $$target) || eval $$failcom; \
+   done;
+
--- a/02-dependencies/bin/.gitdir
+++ b/02-dependencies/bin/.gitdir
--- a/02-dependencies/bin/run-cholesky.sh
+++ b/02-dependencies/bin/run-cholesky.sh
@ -0,0 +1,29 @@
+#!/bin/bash
+#SBATCH --job-name=cholesky
+#SBATCH --workdir=.
+#SBATCH --output=cholesky_%j.out
+#SBATCH --error=cholesky_%j.err
+#SBATCH --cpus-per-task=48
+#SBATCH --ntasks=1
+#SBATCH --time=00:30:00
+#SBATCH --qos=debug
+
+export IFS=";"
+
+THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48"
+#THREADS="01;02;04;06;08;10;12"
+
+#MSIZES="2048;4096;8192"
+MSIZES="8192"
+MSIZES="16384"
+#MSIZES="2048"
+
+BSIZES="256"
+
+for MS in $MSIZES; do
+  for BS in $BSIZES; do
+    for threads in $THREADS; do
+      OMP_NUM_THREADS=$threads ./cholesky-tsk-dep $MS $BS 0
+    done
+  done
+done
--- a/02-dependencies/cholesky-tsk-dep/Makefile
+++ b/02-dependencies/cholesky-tsk-dep/Makefile
@ -0,0 +1,23 @@
+BIN_DIR=../bin
+PROGRAM=$(BIN_DIR)/cholesky-tsk-dep
+
+MKL_HOME=$(MKLROOT)
+
+CC=gcc
+CFLAGS=-O3 -std=gnu99 -I$(MKL_HOME)/include -fopenmp
+LDFLAGS=-L$(MKL_HOME)/lib/intel64 -lmkl_sequential -lmkl_core -lmkl_rt -lpthread -lm -fopenmp
+#VERBOSE=-DVERBOSE
+
+all: $(PROGRAM) 
+
+$(PROGRAM): cholesky.c 
+	$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
+
+$(BIN_DIR): 
+	mkdir $@
+
+clean:
+	rm -rf $(PROGRAM) *.o
+
+wipe: clean
+	rm -rf *.out *.err
--- a/02-dependencies/cholesky-tsk-dep/cholesky-solved.c
+++ b/02-dependencies/cholesky-tsk-dep/cholesky-solved.c
@ -0,0 +1,296 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+
+#include <sys/time.h>
+#include <sys/times.h>
+
+#include <math.h>
+#include <mkl.h>
+
+#include "omp.h"
+
+#if !defined(_OPENMP)
+    int omp_get_max_threads() { return 1; }
+    int omp_get_num_threads() { return 1; }
+#endif
+
+void cholesky(int ts, int nt, double* Ah[nt][nt])
+{
+#ifdef VERBOSE
+	printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
+#endif
+
+   #pragma omp parallel
+   #pragma omp single
+   for (int k = 0; k < nt; k++) {
+      // Diagonal Block factorization: using LAPACK
+      #pragma omp task depend(inout: Ah[k][k])
+      LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
+
+      // Triangular systems
+      for (int i = k + 1; i < nt; i++) {
+         #pragma omp task depend(in: Ah[k][k]) depend(inout: Ah[k][i])
+         cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
+      }
+
+      // Update trailing matrix
+      for (int i = k + 1; i < nt; i++) {
+         for (int j = k + 1; j < i; j++) {
+            #pragma omp task depend(in: Ah[k][i], Ah[k][j]) depend(inout: Ah[j][i])
+            cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
+         }
+         #pragma omp task depend(in: Ah[k][i]) depend(inout: Ah[i][i])
+         cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
+      }
+   }
+
+   #pragma omp taskwait
+#ifdef VERBOSE
+	printf("> ...end of Cholesky Factorization.\n");
+#endif
+}
+
+float get_time()
+{
+	static double gtod_ref_time_sec = 0.0;
+
+	struct timeval tv;
+	gettimeofday(&tv, NULL);
+
+	// If this is the first invocation of through dclock(), then initialize the
+	// "reference time" global variable to the seconds field of the tv struct.
+	if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
+
+	// Normalize the seconds field of the tv struct so that it is relative to the
+	// "reference time" that was recorded during the first invocation of dclock().
+	const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
+
+	// Compute the number of seconds since the reference time.
+	const double t = norm_sec + tv.tv_usec * 1.0e-6;
+
+	return (float) t;
+}
+
+// Robust Check the factorization of the matrix A2
+// Using directly Fortran services: dlacpy_, dtrmm, dlange_
+static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
+{
+#ifdef VERBOSE
+	printf("> Checking the Cholesky Factorization... \n");
+#endif
+	char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
+
+	double *Residual = (double *) malloc(N*N*sizeof(double));
+	double *L1       = (double *) malloc(N*N*sizeof(double));
+	double *L2       = (double *) malloc(N*N*sizeof(double));
+	double *work     = (double *) malloc(N*sizeof(double));
+
+	memset((void*)L1, 0, N*N*sizeof(double));
+	memset((void*)L2, 0, N*N*sizeof(double));
+
+	double alpha= 1.0;
+
+	dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
+
+	/* Dealing with L'L or U'U  */
+	if (uplo == 'U'){
+		dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
+		dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
+		dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
+	} else{
+		dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
+		dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
+		dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
+	}
+
+	/* Compute the Residual || A -L'L|| */
+	for (int i = 0; i < N; i++)
+		for (int j = 0; j < N; j++)
+			Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
+
+	double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
+	double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
+
+#ifdef VERBOSE
+	printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
+#endif
+
+	const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
+
+#ifdef VERBOSE
+	if ( info_factorization) printf("> - Factorization is suspicious!\n");
+	else printf("> - Factorization is CORRECT!\n");
+#endif
+
+	free(Residual); free(L1); free(L2); free(work);
+
+	return info_factorization;
+}
+
+void initialize_matrix(const int n, const int ts, double *matrix)
+{
+#ifdef VERBOSE
+	printf("> Initializing matrix with random values...\n");
+#endif
+
+	int ISEED[4] = {0,0,0,1};
+	int intONE=1;
+
+	for (int i = 0; i < n*n; i+=n) {
+		dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
+	}
+
+	for (int i=0; i<n; i++) {
+		for (int j=0; j<n; j++) {
+			matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
+			matrix[i*n+j] = matrix[j*n+i];
+		}
+	}
+
+   // Diagonal values
+	for (int i = 0; i < n; i++) {
+		matrix[i*n+i] += (double) n;
+   }
+}
+
+void gather_block(const int N, const int ts, double *Alin, double *A)
+{
+	for (int i = 0; i < ts; i++)
+		for (int j = 0; j < ts; j++) {
+			A[i*ts + j] = Alin[i*N + j];
+		}
+}
+
+void scatter_block(const int N, const int ts, double *A, double *Alin)
+{
+	for (int i = 0; i < ts; i++)
+		for (int j = 0; j < ts; j++) {
+			Alin[i*N + j] = A[i*ts + j];
+		}
+}
+
+void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
+{
+#ifdef VERBOSE
+	printf("> Converting linear matrix to blocks...\n");
+#endif
+   for (int i = 0; i < DIM; i++)
+      for (int j = 0; j < DIM; j++) {
+         gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
+      }
+}
+
+void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
+{
+#ifdef VERBOSE
+	printf("> Converting blocked matrix to linear...\n");
+#endif
+   for (int i = 0; i < DIM; i++)
+      for (int j = 0; j < DIM; j++) {
+         scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
+      }
+}
+
+int main(int argc, char* argv[])
+{
+   char *result[3] = {"n/a","pass","FAIL"};
+	const double eps = pow(2.0, -53);
+
+   // If number of arguments is not 4, print help
+   if ( argc != 4) {
+      printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
+      exit( -1 );
+   }
+
+   const int  n = atoi(argv[1]); // matrix size
+   const int ts = atoi(argv[2]); // tile size
+   int check    = atoi(argv[3]); // check result?
+
+   // Compute number of tiles
+   const int nt = n / ts;
+   assert((nt*ts) == n); // tile size should divide size
+
+   // Allocate matrix
+   double * const matrix = (double *) malloc(n * n * sizeof(double));
+   assert(matrix != NULL);
+
+   // Initialize matrix
+   initialize_matrix(n, ts, matrix);
+
+   // Allocate original matrix, and duplicate it, for debugging purposes
+   double * const original_matrix = (double *) malloc(n * n * sizeof(double));
+   assert(original_matrix != NULL);
+
+   // Save a copy of the original matrix
+   for (int i = 0; i < n * n; i++ ) {
+      original_matrix[i] = matrix[i];
+   }
+
+   // Set version description: Indirect blocked matrix
+   const char *version = "I-Blocked matrix";
+
+   // Allocate blocked matrix
+   double *Ah[nt][nt];
+   for (int i = 0; i < nt; i++) {
+      for (int j = 0; j < nt; j++) {
+         Ah[i][j] = malloc(ts * ts * sizeof(double));
+         assert(Ah[i][j] != NULL);
+      }
+   }
+
+   // ---------------------------------------
+   // Convert, compute (time), and re-convert
+   // ---------------------------------------
+   convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
+   const float tref = get_time();
+   cholesky(ts, nt, (double* (*)[nt]) Ah);
+   const float time = get_time() - tref;
+   convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
+
+   // Free blocked matrix
+   for (int i = 0; i < nt; i++) {
+      for (int j = 0; j < nt; j++) {
+         assert(Ah[i][j] != NULL);
+         free(Ah[i][j]);
+      }
+   }
+
+   // Check result, if requested
+   if ( check ) {
+      const char uplo = 'L';
+      if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
+   }
+
+   // Free original matrix, not needed anymore
+   free(original_matrix);
+
+   // Compute GFLOPs (Not verified)
+   float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
+
+   // Print results
+#ifdef VERBOSE
+   printf( "\n" );
+   printf( "============ CHOLESKY RESULTS ============\n" );
+   printf( "  test                  %s\n", argv[0]);
+   printf( "  version               %s\n", version);
+   printf( "  matrix size:          %dx%d\n", n, n);
+   printf( "  tile size:            %dx%d\n", ts, ts);
+   printf( "  number of threads:    %d\n", omp_get_max_threads());
+   printf( "  time (s):             %f\n", time);
+   printf( "  performance (gflops): %f\n", gflops);
+   printf( "  check:                %s\n", result[check]);
+   printf( "==========================================\n" );
+#else
+   printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
+#endif
+
+   // Free matrix
+   free(matrix);
+
+   return 0;
+}
+
--- a/02-dependencies/cholesky-tsk-dep/cholesky.c
+++ b/02-dependencies/cholesky-tsk-dep/cholesky.c
@ -0,0 +1,293 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+
+#include <sys/time.h>
+#include <sys/times.h>
+
+#include <math.h>
+#include <mkl.h>
+
+#include "omp.h"
+
+#if !defined(_OPENMP)
+    int omp_get_max_threads() { return 1; }
+    int omp_get_num_threads() { return 1; }
+#endif
+
+void cholesky(int ts, int nt, double* Ah[nt][nt])
+{
+#ifdef VERBOSE
+	printf("> Computing Cholesky Factorization: indirect blocked matrix...\n");
+#endif
+
+   // TODO: Parallel region is required in order to run in parallel.
+   // TODO: Kernel invocations could be task candidates.
+   // TODO: Add scheduler restrictions between tasks using dependences.
+   for (int k = 0; k < nt; k++) {
+      // Diagonal Block factorization: using LAPACK
+      LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', ts, Ah[k][k], ts);
+
+      // Triangular systems
+      for (int i = k + 1; i < nt; i++) {
+         cblas_dtrsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, ts, ts, 1.0, Ah[k][k], ts, Ah[k][i], ts);
+      }
+
+      // Update trailing matrix
+      for (int i = k + 1; i < nt; i++) {
+         for (int j = k + 1; j < i; j++) {
+            cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, ts, ts, ts, -1.0, Ah[k][i], ts, Ah[k][j], ts, 1.0, Ah[j][i], ts);
+         }
+         cblas_dsyrk(CblasColMajor, CblasLower, CblasNoTrans, ts, ts, -1.0, Ah[k][i], ts, 1.0, Ah[i][i], ts);
+      }
+   }
+
+// TODO: we need to guarantee all the work has finished at this point
+#ifdef VERBOSE
+	printf("> ...end of Cholesky Factorization.\n");
+#endif
+}
+
+float get_time()
+{
+	static double gtod_ref_time_sec = 0.0;
+
+	struct timeval tv;
+	gettimeofday(&tv, NULL);
+
+	// If this is the first invocation of through dclock(), then initialize the
+	// "reference time" global variable to the seconds field of the tv struct.
+	if (gtod_ref_time_sec == 0.0) gtod_ref_time_sec = (double) tv.tv_sec;
+
+	// Normalize the seconds field of the tv struct so that it is relative to the
+	// "reference time" that was recorded during the first invocation of dclock().
+	const double norm_sec = (double) tv.tv_sec - gtod_ref_time_sec;
+
+	// Compute the number of seconds since the reference time.
+	const double t = norm_sec + tv.tv_usec * 1.0e-6;
+
+	return (float) t;
+}
+
+// Robust Check the factorization of the matrix A2
+// Using directly Fortran services: dlacpy_, dtrmm, dlange_
+static int check_factorization(int N, double *A1, double *A2, int LDA, char uplo, double eps)
+{
+#ifdef VERBOSE
+	printf("> Checking the Cholesky Factorization... \n");
+#endif
+	char NORM = 'I', ALL = 'A', UP = 'U', LO = 'L', TR = 'T', NU = 'N', RI = 'R';
+
+	double *Residual = (double *) malloc(N*N*sizeof(double));
+	double *L1       = (double *) malloc(N*N*sizeof(double));
+	double *L2       = (double *) malloc(N*N*sizeof(double));
+	double *work     = (double *) malloc(N*sizeof(double));
+
+	memset((void*)L1, 0, N*N*sizeof(double));
+	memset((void*)L2, 0, N*N*sizeof(double));
+
+	double alpha= 1.0;
+
+	dlacpy_(&ALL, &N, &N, A1, &LDA, Residual, &N);
+
+	/* Dealing with L'L or U'U  */
+	if (uplo == 'U'){
+		dlacpy_(&UP, &N, &N, A2, &LDA, L1, &N);
+		dlacpy_(&UP, &N, &N, A2, &LDA, L2, &N);
+		dtrmm(&LO, &uplo, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
+	} else{
+		dlacpy_(&LO, &N, &N, A2, &LDA, L1, &N);
+		dlacpy_(&LO, &N, &N, A2, &LDA, L2, &N);
+		dtrmm(&RI, &LO, &TR, &NU, &N, &N, &alpha, L1, &N, L2, &N);
+	}
+
+	/* Compute the Residual || A -L'L|| */
+	for (int i = 0; i < N; i++)
+		for (int j = 0; j < N; j++)
+			Residual[j*N+i] = L2[j*N+i] - Residual[j*N+i];
+
+	double Rnorm = dlange_(&NORM, &N, &N, Residual, &N, work);
+	double Anorm = dlange_(&NORM, &N, &N, A1, &N, work);
+
+#ifdef VERBOSE
+	printf("> - ||L'L-A||_oo/(||A||_oo.N.eps) = %e \n",Rnorm / (Anorm*N*eps));
+#endif
+
+	const int info_factorization = isnan(Rnorm/(Anorm*N*eps)) || isinf(Rnorm/(Anorm*N*eps)) || (Rnorm/(Anorm*N*eps) > 60.0);
+
+#ifdef VERBOSE
+	if ( info_factorization) printf("> - Factorization is suspicious!\n");
+	else printf("> - Factorization is CORRECT!\n");
+#endif
+
+	free(Residual); free(L1); free(L2); free(work);
+
+	return info_factorization;
+}
+
+void initialize_matrix(const int n, const int ts, double *matrix)
+{
+#ifdef VERBOSE
+	printf("> Initializing matrix with random values...\n");
+#endif
+
+	int ISEED[4] = {0,0,0,1};
+	int intONE=1;
+
+	for (int i = 0; i < n*n; i+=n) {
+		dlarnv_(&intONE, &ISEED[0], &n, &matrix[i]);
+	}
+
+	for (int i=0; i<n; i++) {
+		for (int j=0; j<n; j++) {
+			matrix[j*n+i] = matrix[j*n+i] + matrix[i*n+j];
+			matrix[i*n+j] = matrix[j*n+i];
+		}
+	}
+
+   // Diagonal values
+	for (int i = 0; i < n; i++) {
+		matrix[i*n+i] += (double) n;
+   }
+}
+
+void gather_block(const int N, const int ts, double *Alin, double *A)
+{
+	for (int i = 0; i < ts; i++)
+		for (int j = 0; j < ts; j++) {
+			A[i*ts + j] = Alin[i*N + j];
+		}
+}
+
+void scatter_block(const int N, const int ts, double *A, double *Alin)
+{
+	for (int i = 0; i < ts; i++)
+		for (int j = 0; j < ts; j++) {
+			Alin[i*N + j] = A[i*ts + j];
+		}
+}
+
+void convert_to_blocks(const int ts, const int DIM, const int N, double Alin[N][N], double *A[DIM][DIM])
+{
+#ifdef VERBOSE
+	printf("> Converting linear matrix to blocks...\n");
+#endif
+   for (int i = 0; i < DIM; i++)
+      for (int j = 0; j < DIM; j++) {
+         gather_block ( N, ts, &Alin[i*ts][j*ts], A[i][j]);
+      }
+}
+
+void convert_to_linear(const int ts, const int DIM, const int N, double *A[DIM][DIM], double Alin[N][N])
+{
+#ifdef VERBOSE
+	printf("> Converting blocked matrix to linear...\n");
+#endif
+   for (int i = 0; i < DIM; i++)
+      for (int j = 0; j < DIM; j++) {
+         scatter_block ( N, ts, A[i][j], (double *) &Alin[i*ts][j*ts]);
+      }
+}
+
+int main(int argc, char* argv[])
+{
+   char *result[3] = {"n/a","pass","FAIL"};
+	const double eps = pow(2.0, -53);
+
+   // If number of arguments is not 4, print help
+   if ( argc != 4) {
+      printf( "%s: matrix_size block_size check[0|1]?\n", argv[0] );
+      exit( -1 );
+   }
+
+   const int  n = atoi(argv[1]); // matrix size
+   const int ts = atoi(argv[2]); // tile size
+   int check    = atoi(argv[3]); // check result?
+
+   // Compute number of tiles
+   const int nt = n / ts;
+   assert((nt*ts) == n); // tile size should divide size
+
+   // Allocate matrix
+   double * const matrix = (double *) malloc(n * n * sizeof(double));
+   assert(matrix != NULL);
+
+   // Initialize matrix
+   initialize_matrix(n, ts, matrix);
+
+   // Allocate original matrix, and duplicate it, for debugging purposes
+   double * const original_matrix = (double *) malloc(n * n * sizeof(double));
+   assert(original_matrix != NULL);
+
+   // Save a copy of the original matrix
+   for (int i = 0; i < n * n; i++ ) {
+      original_matrix[i] = matrix[i];
+   }
+
+   // Set version description: Indirect blocked matrix
+   const char *version = "I-Blocked matrix";
+
+   // Allocate blocked matrix
+   double *Ah[nt][nt];
+   for (int i = 0; i < nt; i++) {
+      for (int j = 0; j < nt; j++) {
+         Ah[i][j] = malloc(ts * ts * sizeof(double));
+         assert(Ah[i][j] != NULL);
+      }
+   }
+
+   // ---------------------------------------
+   // Convert, compute (time), and re-convert
+   // ---------------------------------------
+   convert_to_blocks(ts, nt, n, (double(*)[n]) matrix, Ah);
+   const float tref = get_time();
+   cholesky(ts, nt, (double* (*)[nt]) Ah);
+   const float time = get_time() - tref;
+   convert_to_linear(ts, nt, n, Ah, (double (*)[n]) matrix);
+
+   // Free blocked matrix
+   for (int i = 0; i < nt; i++) {
+      for (int j = 0; j < nt; j++) {
+         assert(Ah[i][j] != NULL);
+         free(Ah[i][j]);
+      }
+   }
+
+   // Check result, if requested
+   if ( check ) {
+      const char uplo = 'L';
+      if ( check_factorization( n, original_matrix, matrix, n, uplo, eps) ) check++;
+   }
+
+   // Free original matrix, not needed anymore
+   free(original_matrix);
+
+   // Compute GFLOPs (Not verified)
+   float gflops = (((1.0 / 3.0) * n * n * n) / ((time) * 1.0e+9));
+
+   // Print results
+#ifdef VERBOSE
+   printf( "\n" );
+   printf( "============ CHOLESKY RESULTS ============\n" );
+   printf( "  test                  %s\n", argv[0]);
+   printf( "  version               %s\n", version);
+   printf( "  matrix size:          %dx%d\n", n, n);
+   printf( "  tile size:            %dx%d\n", ts, ts);
+   printf( "  number of threads:    %d\n", omp_get_max_threads());
+   printf( "  time (s):             %f\n", time);
+   printf( "  performance (gflops): %f\n", gflops);
+   printf( "  check:                %s\n", result[check]);
+   printf( "==========================================\n" );
+#else
+   printf("test, %s, version, %s, n, %d, ts, %d, num_threads, %d, gflops, %f, time, %f, check, %s\n", argv[0], version, n, ts, omp_get_max_threads(), gflops, time, result[check]);
+#endif
+
+   // Free matrix
+   free(matrix);
+
+   return 0;
+}
+
--- a/03-cut-offs/Makefile
+++ b/03-cut-offs/Makefile
@ -0,0 +1,14 @@
+ALL_RECURSIVE=all-recursive clean-recursive 
+
+DIRS=$(shell ls -d */ | sed '/_/d' 2>/dev/null | sed '/bin/d' 2>/dev/null)
+
+all:   all-recursive
+clean: clean-recursive
+
+$(ALL_RECURSIVE):
+	@failcom='exit 1';\
+   target=`echo $@ | sed s/-recursive//`; \
+   for subdir in $(DIRS); do \
+      (cd $$subdir && $(MAKE) $$target) || eval $$failcom; \
+   done;
+
--- a/03-cut-offs/bin/.gitdir
+++ b/03-cut-offs/bin/.gitdir
--- a/03-cut-offs/bin/run-mergesort.sh
+++ b/03-cut-offs/bin/run-mergesort.sh
@ -0,0 +1,24 @@
+#!/bin/bash
+#SBATCH --job-name=mergesort
+#SBATCH --workdir=.
+#SBATCH --output=mergesort_%j.out
+#SBATCH --error=mergesort_%j.err
+#SBATCH --cpus-per-task=48
+#SBATCH --ntasks=1
+#SBATCH --time=00:30:00
+#SBATCH --qos=debug
+
+export IFS=";"
+
+THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48"
+#THREADS="01;02;04;06;08;10;12"
+
+ASIZES="1000000"
+
+for AS in $ASIZES; do
+  for threads in $THREADS; do
+    echo Running mergesort-co with $threads
+    OMP_NUM_THREADS=$threads ./mergesort-co $AS 
+    echo -- 
+  done
+done
--- a/03-cut-offs/bin/run-sudoku.sh
+++ b/03-cut-offs/bin/run-sudoku.sh
@ -0,0 +1,20 @@
+#!/bin/bash
+#SBATCH --job-name=sudoku
+#SBATCH --workdir=.
+#SBATCH --output=sudoku_%j.out
+#SBATCH --error=sudoku_%j.err
+#SBATCH --cpus-per-task=48
+#SBATCH --ntasks=1
+#SBATCH --time=00:30:00
+#SBATCH --qos=debug
+
+export IFS=";"
+
+THREADS="01;02;04;06;08;10;12;14;16;18;20;22;24;26;28;30;32;34;36;38;40;42;44;46;48"
+#THREADS="01;02;04;06;08;10;12"
+
+for threads in $THREADS; do
+  #OMP_NUM_THREADS=$threads ./sudoku-co 9 3 sudoku-9x9-1.txt 
+  OMP_NUM_THREADS=$threads ./sudoku-co 16 4 sudoku-16x16-1.txt 
+  #OMP_NUM_THREADS=$threads ./sudoku-co 25 5 sudoku-25x25-1.txt 
+done
--- a/03-cut-offs/bin/sudoku-16x16-1.txt
+++ b/03-cut-offs/bin/sudoku-16x16-1.txt
@ -0,0 +1,16 @@
+0 6 0 0 0 0 0 8 11 0 0 15 14 0 0 16
+15 11 0 0 0 16 14 0 0 0 12 0 0 6 0 0
+13 0 9 12 0 0 0 0 3 16 14 0 15 11 10 0
+2 0 16 0 11 0 15 10 1 0 0 0 0 0 0 0
+0 15 11 10 0 0 16 2 13 8 9 12 0 0 0 0
+12 13 0 0 4 1 5 6 2 3 0 0 0 0 11 10
+5 0 6 1 12 0 9 0 15 11 10 7 16 0 0 3
+0 2 0 0 0 10 0 11 6 0 5 0 0 13 0 9
+10 7 15 11 16 0 0 0 12 13 0 0 0 0 0 6
+9 0 0 0 0 0 1 0 0 2 0 16 10 0 0 11
+1 0 4 6 9 13 0 0 7 0 11 0 3 16 0 0
+16 14 0 0 7 0 10 15 4 6 1 0 0 0 13 8
+11 10 0 15 0 0 0 16 9 12 13 0 0 1 5 4
+0 0 12 0 1 4 6 0 16 0 0 0 11 10 0 0
+0 0 5 0 8 12 13 0 10 0 0 11 2 0 0 14
+3 16 0 0 10 0 0 7 0 0 6 0 0 0 12 0
--- a/03-cut-offs/bin/sudoku-25x25-1.txt
+++ b/03-cut-offs/bin/sudoku-25x25-1.txt
@ -0,0 +1,25 @@
+0 8 0 18 0 12 20 21 16 0 0 0 0 0 0 15 0 0 24 5 7 0 19 0 0
+0 0 15 9 5 4 0 23 0 18 12 20 0 21 16 7 0 0 0 2 1 0 0 0 0
+0 0 0 0 14 7 0 0 0 0 0 24 0 0 0 0 10 0 0 16 0 0 23 18 6
+21 20 12 0 16 0 0 25 14 22 7 0 0 19 2 4 0 23 8 6 0 24 0 0 0
+19 0 7 0 2 15 0 0 0 9 0 0 18 0 0 0 0 0 0 0 12 0 21 0 16
+0 23 6 0 0 16 0 0 0 20 14 25 11 0 0 5 24 7 3 9 2 19 1 0 17
+4 21 0 20 0 0 0 12 22 11 2 19 0 0 0 0 0 15 23 0 0 3 7 0 9
+12 25 14 0 22 0 19 0 0 13 0 0 24 7 0 0 0 0 21 10 6 0 15 8 18
+7 0 0 24 9 6 0 15 18 0 0 0 0 0 10 0 13 1 19 0 0 25 0 11 22
+0 19 2 13 17 5 0 0 9 0 6 0 8 15 0 14 11 12 25 22 16 0 0 0 10
+0 0 0 1 0 0 0 0 3 7 8 0 15 0 0 11 12 10 0 0 0 0 18 4 0
+10 16 11 12 25 0 14 22 19 1 0 0 0 17 3 20 0 0 6 21 8 0 0 0 0
+0 0 8 0 23 20 0 18 21 4 11 16 0 0 25 0 7 0 2 0 0 14 22 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 22 19 8 15 0 5 23 24 0 17 0 3
+0 2 0 7 3 8 0 9 23 15 20 0 4 0 21 13 1 0 14 19 0 0 10 12 25
+2 0 0 0 0 18 15 5 0 23 0 0 21 6 20 17 0 14 1 13 0 12 0 25 11
+0 1 0 19 13 9 7 0 24 0 0 0 23 0 8 0 25 0 0 11 0 0 0 21 0
+0 0 10 0 0 0 0 0 0 0 17 1 0 14 0 18 0 5 15 8 9 0 0 0 0
+16 12 0 0 11 0 0 14 0 0 0 0 3 2 24 0 21 0 4 20 0 15 0 0 8
+0 0 0 0 0 10 0 0 0 0 0 12 0 0 11 9 3 0 0 0 17 0 14  19 13
+24 0 0 0 15 21 18 0 4 0 0 0 16 20 0 3 2 13 0 7 19 22 0 0 1
+0 18 0 0 0 0 10 0 0 16 0 22 0 0 0 23 0 24 9 0 3 17 0 0 0
+0 0 0 0 0 0 0 0 15 5 0 0 6 8 4 0 14 11 22 1 25 0 20 16 12
+0 22 19 14 1 3 0 0 0 0 23 9 5 0 15 25 0 20 10 12 21 0 8 0 4
+20 0 25 16 0 0 22 11 1 0 3 0 2 13 7 21 0 0 0 0 0 9 0 0 0
--- a/03-cut-offs/bin/sudoku-9x9-1.txt
+++ b/03-cut-offs/bin/sudoku-9x9-1.txt
@ -0,0 +1,10 @@
+5 3 0 0 7 0 0 0 0
+6 0 0 1 9 5 3 0 0
+0 9 8 0 0 0 0 6 0
+8 0 0 0 6 0 0 0 3
+4 0 0 8 0 3 0 0 1
+7 0 0 0 2 0 0 0 6
+0 6 0 0 0 0 2 8 0
+0 0 0 4 1 9 0 0 5
+0 0 0 0 8 0 0 7 9
+
--- a/03-cut-offs/mergesort-co/Makefile
+++ b/03-cut-offs/mergesort-co/Makefile
@ -0,0 +1,20 @@
+BIN_DIR=../bin
+PROGRAM=$(BIN_DIR)/mergesort-co
+
+CC=g++
+CFLAGS=-O3 -fopenmp
+LDFLAGS=-fopenmp
+
+all: $(PROGRAM) 
+
+$(PROGRAM): mergesort.cpp
+	$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
+
+$(BIN_DIR): 
+	mkdir $@
+
+clean:
+	rm -rf $(PROGRAM) *.o
+
+wipe: clean
+	rm -rf *.out *.err
--- a/03-cut-offs/mergesort-co/mergesort-solved.cpp
+++ b/03-cut-offs/mergesort-co/mergesort-solved.cpp
@ -0,0 +1,272 @@
+/*
+*  This file is part of Christian's OpenMP software lab 
+*
+*  Copyright (C) 2016 by Christian Terboven <terboven@itc.rwth-aachen.de>
+*  Copyright (C) 2016 by Jonas Hahnfeld <hahnfeld@itc.rwth-aachen.de>
+*
+*  This program is free software; you can redistribute it and/or modify
+*  it under the terms of the GNU General Public License as published by
+*  the Free Software Foundation; either version 2 of the License, or
+*  (at your option) any later version.
+*
+*  This program is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+*  GNU General Public License for more details.
+*
+*  You should have received a copy of the GNU General Public License
+*  along with this program; if not, write to the Free Software
+*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/time.h>
+
+#include <iostream>
+#include <algorithm>
+
+#include <cstdlib>
+#include <cstdio>
+
+#include <cmath>
+#include <ctime>
+#include <cstring>
+
+#include <omp.h>
+
+
+/**
+  * helper routine: check if array is sorted correctly
+  */
+bool isSorted(int ref[], int data[], const size_t size){
+	std::sort(ref, ref + size);
+	for (size_t idx = 0; idx < size; ++idx){
+		if (ref[idx] != data[idx]) {
+			return false;
+		}
+	}
+	return true;
+}
+
+
+/**
+  * sequential merge step (straight-forward implementation)
+  */
+void MsMergeSequential(int *out, int *in, long begin1, long end1, long begin2, long end2, long outBegin) {
+	long left = begin1;
+	long right = begin2;
+
+	long idx = outBegin;
+
+	while (left < end1 && right < end2) {
+		if (in[left] <= in[right]) {
+			out[idx] = in[left];
+			left++;
+		} else {
+			out[idx] = in[right];
+			right++;
+		}
+		idx++;
+	}
+
+	while (left < end1) {
+		out[idx] = in[left];
+		left++, idx++;
+	}
+
+	while (right < end2) {
+		out[idx] = in[right];
+		right++, idx++;
+	}
+}
+
+
+/**
+  * sequential MergeSort
+  */
+void MsSequential(int *array, int *tmp, bool inplace, long begin, long end) {
+	if (begin < (end - 1)) {
+		const long half = (begin + end) / 2;
+		MsSequential(array, tmp, !inplace, begin, half);
+		MsSequential(array, tmp, !inplace, half, end);
+		if (inplace) {
+			MsMergeSequential(array, tmp, begin, half, half, end, begin);
+		} else {
+			MsMergeSequential(tmp, array, begin, half, half, end, begin);
+		}
+	} else if (!inplace) {
+		tmp[begin] = array[begin];
+	}
+}
+
+/**
+  * parallel merge step (straight-forward implementation)
+  */
+void MsMergeParallel(int *out, int *in, long begin1, long end1, long begin2, long end2, long outBegin, int deep) {
+	if (deep) {
+		long half1, half2, tmp, count, step;
+		if ((end1 - begin1) < (end2 - begin2)) {
+			half2 = (begin2 + end2) / 2;
+			// find in[half2] in [begin1, end1) (std::upper_bound)
+			half1 = begin1, count = (end1 - begin1);
+			while (count > 0) {
+				step = count / 2;
+				tmp = half1 + step;
+				if (in[tmp] <= in[half2]) {
+					tmp++;
+					half1 = tmp;
+					count -= step + 1;
+				} else {
+					count = step;
+				}
+			}
+		} else {
+			half1 = (begin1 + end1) / 2;
+			// find in[half1] in [begin2, end2) (std::lower_bound)
+			half2 = begin2, count = (end2 - begin2);
+			while (count > 0) {
+				step = count / 2;
+				tmp = half2 + step;
+				if (in[tmp] < in[half1]) {
+					tmp++;
+					half2 = tmp;
+					count -= step + 1;
+				} else {
+					count = step;
+				}
+			}
+		}
+
+		#pragma omp task default(shared)
+		{
+			MsMergeParallel(out, in, begin1, half1, begin2, half2, outBegin, deep - 1);
+		}
+
+		long outBegin2 = outBegin + (half1 - begin1) + (half2 - begin2);
+		#pragma omp task default(shared)
+		{
+			MsMergeParallel(out, in, half1, end1, half2, end2, outBegin2, deep - 1);
+		}
+
+		#pragma omp taskwait
+	} else {
+		MsMergeSequential(out, in, begin1, end1, begin2, end2, outBegin);
+	}
+}
+
+/**
+  * OpenMP Task-parallel MergeSort
+  */
+void MsParallel(int *array, int *tmp, bool inplace, long begin, long end, int deep) {
+	if (begin < (end - 1)) {
+		long half = (begin + end) / 2;
+		if (deep){
+			#pragma omp task default(shared)
+			{
+				MsParallel(array, tmp, !inplace, begin, half, deep - 1);
+			}
+
+			#pragma omp task default(shared)
+			{
+				MsParallel(array, tmp, !inplace, half, end, deep - 1);
+			}
+
+			#pragma omp taskwait
+		}
+		else {
+			MsSequential(array, tmp, !inplace, begin, half);
+			MsSequential(array, tmp, !inplace, half, end);
+		}
+
+		if (inplace) {
+			MsMergeParallel(array, tmp, begin, half, half, end, begin, deep);
+		} else {
+			MsMergeParallel(tmp, array, begin, half, half, end, begin, deep);
+		}
+	} else if (!inplace) {
+		tmp[begin] = array[begin];
+	}
+}
+
+
+/**
+  * OpenMP Task-parallel MergeSort
+  * startup routine containing the Parallel Region
+  */
+void MsParallelOmp(int *array, int *tmp, const size_t size) {
+
+	// compute cut-off recursion level
+	const int iMinTask = (omp_get_max_threads() * 5);
+	int deep = 0;
+	while ((1 << deep) < iMinTask) deep += 1;
+
+#pragma omp parallel
+#pragma omp master
+	{
+		MsParallel(array, tmp, true, 0, size, deep);
+	}
+}
+
+/** 
+  * @brief program entry point
+  */
+int main(int argc, char* argv[]) {
+	// variables to measure the elapsed time
+	struct timeval t1, t2;
+	double etime;
+
+	// expect one command line arguments: array size
+	if (argc != 2) {
+		printf("Usage: MergeSort.exe <array size> \n");
+		printf("\n");
+		return EXIT_FAILURE;
+	}
+	else {
+		const size_t stSize = strtol(argv[1], NULL, 10);
+		int *data = (int*) malloc(stSize * sizeof(int));
+		int *tmp = (int*) malloc(stSize * sizeof(int));
+		int *ref = (int*) malloc(stSize * sizeof(int));
+
+		// first touch
+		#pragma omp parallel for
+		for (size_t idx = 0; idx < stSize; ++idx){
+			data[idx] = 0;
+			tmp[idx] = 0;
+		}
+
+		printf("Initialization...\n");
+
+		srand(95);
+		for (size_t idx = 0; idx < stSize; ++idx){
+			data[idx] = (int) (stSize * (double(rand()) / RAND_MAX));
+		}
+		std::copy(data, data + stSize, ref);
+
+		double dSize = (stSize * sizeof(int)) / 1024 / 1024;
+		printf("Sorting %zu elements of type int (%f MiB)...\n", stSize, dSize);
+
+		gettimeofday(&t1, NULL);
+		MsParallelOmp(data, tmp, stSize);
+		gettimeofday(&t2, NULL);
+
+		etime = (t2.tv_sec - t1.tv_sec) * 1000 + (t2.tv_usec - t1.tv_usec) / 1000;
+		etime = etime / 1000;
+
+		printf("done, took %f sec. Verification...", etime);
+		if (isSorted(ref, data, stSize)) {
+			printf(" successful.\n");
+		}
+		else {
+			printf(" FAILED.\n");
+		}
+
+		free(data);
+		free(tmp);
+		free(ref);
+	}
+
+	return EXIT_SUCCESS;
+}
--- a/03-cut-offs/mergesort-co/mergesort.cpp
+++ b/03-cut-offs/mergesort-co/mergesort.cpp
@ -0,0 +1,174 @@
+/*
+*  This file is part of Christian's OpenMP software lab 
+*
+*  Copyright (C) 2016 by Christian Terboven <terboven@itc.rwth-aachen.de>
+*  Copyright (C) 2016 by Jonas Hahnfeld <hahnfeld@itc.rwth-aachen.de>
+*
+*  This program is free software; you can redistribute it and/or modify
+*  it under the terms of the GNU General Public License as published by
+*  the Free Software Foundation; either version 2 of the License, or
+*  (at your option) any later version.
+*
+*  This program is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+*  GNU General Public License for more details.
+*
+*  You should have received a copy of the GNU General Public License
+*  along with this program; if not, write to the Free Software
+*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/time.h>
+
+#include <iostream>
+#include <algorithm>
+
+#include <cstdlib>
+#include <cstdio>
+
+#include <cmath>
+#include <ctime>
+#include <cstring>
+
+
+
+/**
+  * helper routine: check if array is sorted correctly
+  */
+bool isSorted(int ref[], int data[], const size_t size){
+	std::sort(ref, ref + size);
+	for (size_t idx = 0; idx < size; ++idx){
+		if (ref[idx] != data[idx]) {
+			return false;
+		}
+	}
+	return true;
+}
+
+
+/**
+  * sequential merge step (straight-forward implementation)
+  */
+// TODO: cut-off could also apply here (extra parameter?)
+// TODO: optional: we can also break merge in two halves
+void MsMergeSequential(int *out, int *in, long begin1, long end1, long begin2, long end2, long outBegin) {
+	long left = begin1;
+	long right = begin2;
+
+	long idx = outBegin;
+
+	while (left < end1 && right < end2) {
+		if (in[left] <= in[right]) {
+			out[idx] = in[left];
+			left++;
+		} else {
+			out[idx] = in[right];
+			right++;
+		}
+		idx++;
+	}
+
+	while (left < end1) {
+		out[idx] = in[left];
+		left++, idx++;
+	}
+
+	while (right < end2) {
+		out[idx] = in[right];
+		right++, idx++;
+	}
+}
+
+
+/**
+  * sequential MergeSort
+  */
+// TODO: remember one additional parameter (depth)
+// TODO: recursive calls could be taskyfied
+// TODO: task synchronization also is required
+void MsSequential(int *array, int *tmp, bool inplace, long begin, long end) {
+	if (begin < (end - 1)) {
+		const long half = (begin + end) / 2;
+		MsSequential(array, tmp, !inplace, begin, half);
+		MsSequential(array, tmp, !inplace, half, end);
+		if (inplace) {
+			MsMergeSequential(array, tmp, begin, half, half, end, begin);
+		} else {
+			MsMergeSequential(tmp, array, begin, half, half, end, begin);
+		}
+	} else if (!inplace) {
+		tmp[begin] = array[begin];
+	}
+}
+
+
+/**
+  * Serial MergeSort
+  */
+// TODO: this function should create the parallel region
+// TODO: good point to compute a good depth level (cut-off)
+void MsSerial(int *array, int *tmp, const size_t size) {
+
+   // TODO: parallel version of MsSequential will receive one more parameter: 'depth' (used as cut-off)
+	MsSequential(array, tmp, true, 0, size);
+}
+
+
+/** 
+  * @brief program entry point
+  */
+int main(int argc, char* argv[]) {
+	// variables to measure the elapsed time
+	struct timeval t1, t2;
+	double etime;
+
+	// expect one command line arguments: array size
+	if (argc != 2) {
+		printf("Usage: MergeSort.exe <array size> \n");
+		printf("\n");
+		return EXIT_FAILURE;
+	}
+	else {
+		const size_t stSize = strtol(argv[1], NULL, 10);
+		int *data = (int*) malloc(stSize * sizeof(int));
+		int *tmp = (int*) malloc(stSize * sizeof(int));
+		int *ref = (int*) malloc(stSize * sizeof(int));
+
+		printf("Initialization...\n");
+
+		srand(95);
+		for (size_t idx = 0; idx < stSize; ++idx){
+			data[idx] = (int) (stSize * (double(rand()) / RAND_MAX));
+		}
+		std::copy(data, data + stSize, ref);
+
+		double dSize = (stSize * sizeof(int)) / 1024 / 1024;
+		printf("Sorting %zu elements of type int (%f MiB)...\n", stSize, dSize);
+
+		gettimeofday(&t1, NULL);
+		MsSerial(data, tmp, stSize);
+		gettimeofday(&t2, NULL);
+
+		etime = (t2.tv_sec - t1.tv_sec) * 1000 + (t2.tv_usec - t1.tv_usec) / 1000;
+		etime = etime / 1000;
+
+		printf("done, took %f sec. Verification...", etime);
+		if (isSorted(ref, data, stSize)) {
+			printf(" successful.\n");
+		}
+		else {
+			printf(" FAILED.\n");
+		}
+
+		free(data);
+		free(tmp);
+		free(ref);
+	}
+
+	return EXIT_SUCCESS;
+}
--- a/03-cut-offs/sudoku-co/Makefile
+++ b/03-cut-offs/sudoku-co/Makefile
@ -0,0 +1,29 @@
+BIN_DIR=../bin
+PROGRAM=$(BIN_DIR)/sudoku-co
+
+CC=g++
+CFLAGS=-c -O3 -fopenmp -DUSE_SEQUENTIAL_CUTOFF
+LDFLAGS=-fopenmp
+
+OBJS=SudokuBoard.o sudoku.o
+
+all: $(PROGRAM) 
+
+$(PROGRAM): $(OBJS)
+	$(CC) -o $@ $^ $(LDFLAGS)
+
+%.o: %.cpp
+	$(CC) $(CFLAGS) -o $@ $^ 
+
+sudoku.cpp:
+	@cat README.md 
+	@false
+
+$(BIN_DIR): 
+	mkdir $@
+
+clean:
+	rm -rf $(PROGRAM) *.o
+
+wipe: clean
+	rm -rf *.out *.err
--- a/03-cut-offs/sudoku-co/README.md
+++ b/03-cut-offs/sudoku-co/README.md
@ -0,0 +1,5 @@
+====================================================================================================
+Copy your previous version of 01-tasks/sudoku-tsk exercise (sudoku.c) and modify the code in order 
+to use a native OpenMP cut-off mechanism (ie, the 'if' or the 'final' clauses) instead of the
+manual cut-off version implemented as: "USE_SEQUENTIAL_CUTOFF".
+====================================================================================================
--- a/03-cut-offs/sudoku-co/SudokuBoard.cpp
+++ b/03-cut-offs/sudoku-co/SudokuBoard.cpp
@ -0,0 +1,119 @@
+/*                    
+ *  This file is part of Christian's OpenMP Task-parallel Sudoku Solver
+ *  
+ *  Copyright (C) 2013 by Christian Terboven <christian@terboven.com>
+ *                                                                       
+ *  This program is free software; you can redistribute it and/or modify 
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "SudokuBoard.h"
+
+#include <cstring>
+
+CSudokuBoard::CSudokuBoard(int fsize, int bsize)
+	: field_size(fsize), block_size(bsize), solutions(-1)
+{
+	field = new int[field_size*field_size];
+}
+
+
+CSudokuBoard::CSudokuBoard(const CSudokuBoard& other)
+	: field_size(other.getFieldSize()), block_size(other.getBlockSize()), solutions(other.getNumSolutions())
+{
+	field = new int[field_size*field_size];
+	std::memcpy(field, other.field, sizeof(int) * field_size*field_size);
+}
+
+
+CSudokuBoard::~CSudokuBoard(void)
+{
+	delete[] field;
+}
+
+
+bool CSudokuBoard::loadFromFile(char *filename)
+{
+  std::ifstream ifile(filename);
+  
+  if (!ifile) {
+    std::cout << "There was an error opening the input file " << filename << std::endl;
+    std::cout << std::endl;
+    return false;
+  }
+
+  for (int i = 0; i < this->field_size; i++) {
+    for (int j = 0; j < this->field_size; j++) {
+	  ifile >> this->field[ACCESS(i,j)];
+    }
+  }
+
+  return true;
+}
+
+
+void CSudokuBoard::printBoard()
+{
+	for(int i = 0; i < field_size; i++) {
+		for(int j = 0; j < field_size; j++) {
+			std::cout << std::setw(3) << 
+				this->field[ACCESS(i,j)] 
+				<< " ";
+		}
+		std::cout << std::endl;
+	}
+}
+
+
+bool CSudokuBoard::check(int x, int y, int value)
+{
+	if(checkHorizontal(y, value))
+		return true;
+	if(checkVertical(x, value))
+		return true;
+	if(checkBox(x, y, value))
+		return true;
+	return false;
+}
+
+
+bool CSudokuBoard::checkHorizontal(int y, int value) {
+	for(int i = 0; i < field_size; i++)
+		if(field[ACCESS(y,i)] == value)
+			return true;
+	return false;
+}
+
+
+bool CSudokuBoard::checkVertical(int x, int value) {
+	for(int i = 0; i < field_size; i++)
+		if(field[ACCESS(i,x)] == value)
+			return true;
+	return false;
+}
+
+
+bool CSudokuBoard::checkBox(int x, int y, int value) {
+	// find suitable box edge
+	int x_box = (int)(x / block_size) * block_size;
+	int y_box = (int)(y / block_size) * block_size;
+
+	for(int i = y_box; i < y_box + block_size; i++)
+		for(int j = x_box; j < x_box + block_size; j++)
+			if(field[ACCESS(i,j)] == value)
+				return true;
+  
+	return false;
+}
--- a/03-cut-offs/sudoku-co/SudokuBoard.h
+++ b/03-cut-offs/sudoku-co/SudokuBoard.h
@ -0,0 +1,96 @@
+/*                    
+ *  This file is part of Christian's OpenMP Task-parallel Sudoku Solver
+ *  
+ *  Copyright (C) 2013 by Christian Terboven <christian@terboven.com>
+ *                                                                       
+ *  This program is free software; you can redistribute it and/or modify 
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <vector>
+
+#define ACCESS(x, y) (this->field_size*(x) + (y))
+
+/**
+ * @brief Representation of the Sudoku board, includes solver and utility functions
+ */
+class CSudokuBoard
+{
+public:
+	CSudokuBoard(int fsize, int bsize);
+	CSudokuBoard(const CSudokuBoard& other);
+	~CSudokuBoard(void);
+
+	inline int getNumSolutions() const {
+		return this->solutions;
+	}
+	
+	inline void incrementSolutionCounter() {
+		if (this->solutions == -1)
+			this->solutions = 1;
+		else
+			this->solutions++;
+	}
+	
+	inline int getFieldSize() const {
+		return this->field_size;
+	}
+	
+	inline int getBlockSize() const {
+		return this->block_size;
+	}
+	
+	inline int get(int x, int y) const {
+		return this->field[ACCESS(x,y)];
+	}
+	
+	inline void set(int x, int y, int value) {
+		this->field[ACCESS(x, y)] = value;
+	}
+
+	/**
+	 * Read Sudoku template from file
+	 * @param filename name of file to read input from
+	 * @return true if reading file was successful, false if not
+	 */
+	bool loadFromFile(char *filename);
+
+	/**
+	 * Print the Sudoku board to stdout
+	 */
+	void printBoard();
+
+	/**
+	 * Check whether value already used
+	 * @return true if already used, false if not
+	 */
+	bool check(int x, int y, int value);
+
+
+private:
+	bool checkHorizontal(int y, int value);
+	bool checkVertical(int x, int value);
+	bool checkBox(int x, int y, int value);
+	
+	int field_size;
+	int block_size;
+
+	int *field;
+
+	int solutions;
+};
--- a/03-cut-offs/sudoku-co/sudoku-solved.cpp
+++ b/03-cut-offs/sudoku-co/sudoku-solved.cpp
@ -0,0 +1,177 @@
+/*                    
+ *  This file is part of Christian's OpenMP Task-parallel Sudoku Solver
+ *  
+ *  Copyright (C) 2013 by Christian Terboven <christian@terboven.com>
+ *                                                                       
+ *  This program is free software; you can redistribute it and/or modify 
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <iostream>
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <omp.h>
+
+#include "SudokuBoard.h"
+
+
+int found_sudokus = 0;
+
+
+/**
+  * Solve the Sudoku puzzle by finding and counting all solutions (if all are requested)
+  */
+bool solve(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true, bool findAllSolutions = true)
+{
+	if(x == sudoku->getFieldSize()) {                 // end of line
+		y++;
+		x = 0;
+		if(y == sudoku->getFieldSize())           // end
+			return true;
+	}   
+  
+	if(sudoku->get(y, x) > 0)                         // field already set
+		return solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions);    // tackle next field
+    
+	for(int i = 1; i <= sudoku->getFieldSize(); i++) {            // try all numbers
+		if(!sudoku->check(x, y, i)) {
+			sudoku->set(y, x, i);                  // if number fits, set it
+			if(solve(x+1, y, sudoku, printAnyFoundSolution, findAllSolutions)) {  // tackle next field
+#pragma omp atomic
+				found_sudokus++;
+				sudoku->incrementSolutionCounter();     // solution found :-)
+				if (printAnyFoundSolution) {
+					std::cout << "The following is a valid solution:" << std::endl;
+					sudoku->printBoard();
+					std::cout << std::endl;
+				}
+				if (!findAllSolutions) {
+					return true;                // return, as only one solution was asked for
+				}
+			}
+		}
+	}
+	
+	sudoku->set(y, x, 0);                            // no solution found, reset field
+	return false;
+}
+
+
+
+/**
+  * Solve the Sudoku puzzle in parallel by finding and counting all solutions
+  */
+bool solve_parallel(int x, int y, CSudokuBoard* sudoku, bool printAnyFoundSolution = true)
+{
+	if (x == sudoku->getFieldSize()) {                       // end of line
+		y++;
+		x = 0;
+		if(y == sudoku->getFieldSize())                     // end
+			return true;
+	}   
+  
+	if (sudoku->get(y, x) > 0) {                         // field already set
+		return solve_parallel(x+1, y, sudoku, printAnyFoundSolution);    // tackle next field
+	}
+    
+//#if USE_SEQUENTIAL_CUTOFF
+//	if ( y > 1 ) {
+//		return solve(x, y, sudoku, printAnyFoundSolution);
+//	}
+//#endif
+	
+	for (int i = 1; i <= sudoku->getFieldSize(); i++) {      // try all numbers
+		if (!sudoku->check(x, y, i)) {
+#pragma omp task firstprivate(i,x,y,sudoku) final(y > 0)
+			{
+				CSudokuBoard new_sudoku(*sudoku);
+				new_sudoku.set(y, x, i);                    // if number fits, set it
+				
+				if (solve_parallel(x+1, y, &new_sudoku, printAnyFoundSolution)) {  // tackle next field
+#pragma omp atomic
+					found_sudokus++;
+	//				sudoku->incrementSolutionCounter();     // solution found :-)
+					if (printAnyFoundSolution) {
+						std::cout << "The following is a valid solution:" << std::endl;
+						new_sudoku.printBoard();
+						std::cout << std::endl;
+					}
+				}
+			}
+		}
+	}
+	
+#pragma omp taskwait
+	sudoku->set(y, x, 0);                            // no solution found, reset field
+	return false;
+}
+
+
+
+/** @brief program entry point
+ */
+int main(int argc, char* argv[]) {
+	// measure the time
+	double t3, t4;
+	int nthreads = 0;
+
+	// expect three command line arguments: field size, block size, and input file
+	if (argc != 4) {
+		std::cout << "Usage: sudoku.exe <field size> <block size> <input filename>" << std::endl;
+		std::cout << std::endl;
+		return -1;
+	}
+	else {
+		CSudokuBoard *sudoku1 = new CSudokuBoard(atoi(argv[1]), atoi(argv[2]));
+		if (!sudoku1->loadFromFile(argv[3])) {
+			std::cout << "There was an error reading a Sudoku template from " << argv[3] << std::endl;
+			std::cout << std::endl;
+			return -1;
+		}
+
+		// print the Sudoku board template
+		std::cout << "Given Sudoku template" << std::endl;
+		sudoku1->printBoard();
+
+		// solve the Sudoku by finding (and printing) all solutions
+		t3 = omp_get_wtime();
+      #pragma omp parallel sections
+		{
+			solve_parallel(0, 0, sudoku1, false);
+		}		
+		t4 = omp_get_wtime();
+
+#pragma omp parallel 
+		{
+#pragma omp master
+			nthreads = omp_get_num_threads();
+		}
+
+		std::cout << std::endl;
+		std::cout << "In total there were " << sudoku1->getNumSolutions() << " solutions found in serial." << std::endl;
+		std::cout << "In total there were " << found_sudokus << " solutions found in parallel " << std::endl;
+		std::cout << std::endl;
+		
+		delete sudoku1;
+	}
+	
+	// print the time
+	std::cout << "Parallel computation took " << t4 - t3 << " seconds (" 
+			<< nthreads << " threads)." << std::endl << std::endl;
+
+	return 0;
+}
--- a/04-matmul/Makefile
+++ b/04-matmul/Makefile
@ -0,0 +1,23 @@
+BIN_DIR=.
+PROGRAM=$(BIN_DIR)/matmul
+
+MKL_HOME=$(MKLROOT)
+
+CC=gcc
+CFLAGS=-O3 -std=gnu99 -fopenmp
+LDFLAGS=-fopenmp
+#VERBOSE=-DVERBOSE
+
+all: $(PROGRAM) 
+
+$(PROGRAM): matmul.c 
+	$(CC) $(CFLAGS) $(VERBOSE) -o $@ $^ $(LDFLAGS)
+
+$(BIN_DIR): 
+	mkdir $@
+
+clean:
+	rm -rf $(PROGRAM) *.o
+
+wipe: clean
+	rm -rf *.out *.err
--- a/04-matmul/matmul.c
+++ b/04-matmul/matmul.c
@ -0,0 +1,174 @@
+//===-- matmul.c - Different implementations of matrix multiplies -*- C -*-===//
+//
+// Part of the LOMP Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <omp.h>
+
+#define N 3072
+
+#define DUMP_MATRIX 0
+#define DUMP_TASKS  0
+
+void matmul_seq(float * C, float * A, float * B, size_t n) {
+    for (size_t i = 0; i < n; ++i) {
+        for (size_t k = 0; k < n; ++k) {
+            for (size_t j = 0; j < n; ++j) {
+                C[i * n + j] += A[i * n + k] * B[k * n + j];
+            }
+        }
+    }
+}
+
+void matmul_par(float * C, float * A, float * B, size_t n) {
+#pragma omp parallel for schedule(static,8) firstprivate(n)
+    for (size_t i = 0; i < n; ++i) {
+        for (size_t k = 0; k < n; ++k) {
+            for (size_t j = 0; j < n; ++j) {
+                C[i * n + j] += A[i * n + k] * B[k * n + j];
+            }
+        }
+    }
+}
+
+void matmul_tloop(float * C, float * A, float * B, size_t n) {
+#pragma omp parallel firstprivate(n)
+#pragma omp single nowait
+#pragma omp taskloop
+    for (size_t i = 0; i < n; ++i) {
+        for (size_t k = 0; k < n; ++k) {
+            for (size_t j = 0; j < n; ++j) {
+                C[i * n + j] += A[i * n + k] * B[k * n + j];
+            }
+        }
+    }
+}
+
+void matmul_task(float * C, float * A, float * B, size_t n) {
+    const size_t bf = 256;
+    if (n % bf != 0) {
+        printf("Blocking factor does not divide matrix size!\n");
+        exit(EXIT_FAILURE);
+    }
+#pragma omp parallel firstprivate(n, bf)
+#pragma omp master
+    {
+        // work on the blocks of the matrix
+        for (size_t ib = 0; ib < n; ib += bf)
+            for (size_t kb = 0; kb < n; kb += bf)
+                for (size_t jb = 0; jb < n; jb += bf) {
+#pragma omp task firstprivate(ib, kb, jb) firstprivate(n, bf) \
+                 depend(inout:C[ib * n + jb:bf]) \
+                 depend(in:A[ib * n + kb:bf]) \
+                 depend(in:B[kb * n + jb:bf])
+                    {
+#if DUMP_TASKS
+                        printf("task: C[%ld,%ld] += A[%ld,%ld] o B[%ld,%ld]\n", ib, jb, ib, kb, kb, jb);
+#endif                        
+                        // work on a single block``
+                        for (size_t i = ib; i < (ib + bf); ++i)
+                            for (size_t k = kb; k < (kb + bf); ++k)
+                                for (size_t j = jb; j < (jb + bf); ++j)
+                                    C[i * n + j] += A[i * n + k] * B[k * n + j];
+                    }
+                }
+    }
+}
+
+void init_mat(float * C, float * A, float * B, size_t n) {
+    for (size_t i = 0; i < n; ++i) {
+        for (size_t j = 0; j < n; ++j) {
+            C[i * n + j] = 0.0;
+            A[i * n + j] = 0.5;
+            B[i * n + j] = 0.25;
+        }
+    }
+}
+
+void dump_mat(float * mtx, size_t n) {
+    for (size_t i = 0; i < n; ++i) {
+        for (size_t j = 0; j < n; ++j) {
+            printf("%f ", mtx[i * n + j]);
+        }
+        printf("\n");
+    }
+}
+
+float sum_mat(float * mtx, size_t n) {
+    float sum = 0.0;
+    for (size_t i = 0; i < n; ++i) {
+        for (size_t j = 0; j < n; ++j) {
+            sum += mtx[i * n + j];
+        }
+    }
+    return sum;
+}
+
+int main(int argc, char * argv[]) {
+    double ts, te;
+    double t_seq;
+
+    float * C;
+    float * A;
+    float * B;
+
+    // If number of arguments is not 1, print help
+    if (argc != 2) {
+        printf("%s: matrix_size\n", argv[0]);
+        return EXIT_FAILURE;
+    } 
+    const int  n = atoi(argv[1]); // matrix size
+
+    C = (float *) malloc(sizeof(*C) * n * n);
+    A = (float *) malloc(sizeof(*A) * n * n);
+    B = (float *) malloc(sizeof(*B) * n * n);
+
+    init_mat(C, A, B, n);
+    ts = omp_get_wtime();
+    matmul_seq(C, A, B, n);
+    te = omp_get_wtime();
+#if DUMP_MATRIX
+    dump_mat(C, n);
+#endif
+    t_seq = te - ts;
+    printf("Sum of matrix (serial):   %f, wall time %lf, speed-up %.2lf\n",
+           sum_mat(C, n), (te-ts), t_seq / (te-ts));
+
+    init_mat(C, A, B, n);
+    ts = omp_get_wtime();
+    matmul_par(C, A, B, n);
+    te = omp_get_wtime();
+#if DUMP_MATRIX
+    dump_mat(C, n);
+#endif
+    printf("Sum of matrix (parallel): %f, wall time %lf, speed-up %.2lf\n",
+           sum_mat(C, n), (te-ts), t_seq / (te-ts));
+
+    init_mat(C, A, B, n);
+    ts = omp_get_wtime();
+    matmul_tloop(C, A, B, n);
+    te = omp_get_wtime();
+#if DUMP_MATRIX
+    dump_mat(C, n);
+#endif
+    printf("Sum of matrix (taskloop): %f, wall time %lf, speed-up %.2lf\n",
+           sum_mat(C, n), (te-ts), t_seq / (te-ts));
+
+    init_mat(C, A, B, n);
+    ts = omp_get_wtime();
+    matmul_task(C, A, B, n);
+    te = omp_get_wtime();
+#if DUMP_MATRIX
+    dump_mat(C, n);
+#endif
+    printf("Sum of matrix (tasks):    %f, wall time %lf, speed-up %.2lf\n",
+           sum_mat(C, n), (te-ts), t_seq / (te-ts));
+
+    return EXIT_SUCCESS;
+}
--- a/04-matmul/solution/matmul.c
+++ b/04-matmul/solution/matmul.c
@ -0,0 +1,165 @@
+//===-- matmul.c - Different implementations of matrix multiplies -*- C -*-===//
+//
+// Part of the LOMP Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <omp.h>
+
+#define N 3072
+
+#define DUMP_MATRIX 0
+#define DUMP_TASKS  0
+
+void matmul_seq(float * C, float * A, float * B, size_t n) {
+    for (size_t i = 0; i < n; ++i) {
+        for (size_t k = 0; k < n; ++k) {
+            for (size_t j = 0; j < n; ++j) {
+                C[i * n + j] += A[i * n + k] * B[k * n + j];
+            }
+        }
+    }
+}
+
+void matmul_par(float * C, float * A, float * B, size_t n) {
+#pragma omp parallel for schedule(static,8) firstprivate(n)
+    for (size_t i = 0; i < n; ++i) {
+        for (size_t k = 0; k < n; ++k) {
+            for (size_t j = 0; j < n; ++j) {
+                C[i * n + j] += A[i * n + k] * B[k * n + j];
+            }
+        }
+    }
+}
+
+void matmul_tloop(float * C, float * A, float * B, size_t n) {
+    for (size_t i = 0; i < n; ++i) {
+        for (size_t k = 0; k < n; ++k) {
+            for (size_t j = 0; j < n; ++j) {
+                C[i * n + j] += A[i * n + k] * B[k * n + j];
+            }
+        }
+    }
+}
+
+void matmul_task(float * C, float * A, float * B, size_t n) {
+    const size_t bf = 256;
+    if (n % bf != 0) {
+        printf("Blocking factor does not divide matrix size!\n");
+        exit(EXIT_FAILURE);
+    }
+    {
+        // work on the blocks of the matrix
+        for (size_t ib = 0; ib < n; ib += bf)
+            for (size_t kb = 0; kb < n; kb += bf)
+                for (size_t jb = 0; jb < n; jb += bf) {
+                    {
+#if DUMP_TASKS
+                        printf("task: C[%ld,%ld] += A[%ld,%ld] o B[%ld,%ld]\n", ib, jb, ib, kb, kb, jb);
+#endif                        
+                        // work on a single block``
+                        for (size_t i = ib; i < (ib + bf); ++i)
+                            for (size_t k = kb; k < (kb + bf); ++k)
+                                for (size_t j = jb; j < (jb + bf); ++j)
+                                    C[i * n + j] += A[i * n + k] * B[k * n + j];
+                    }
+                }
+    }
+}
+
+void init_mat(float * C, float * A, float * B, size_t n) {
+    for (size_t i = 0; i < n; ++i) {
+        for (size_t j = 0; j < n; ++j) {
+            C[i * n + j] = 0.0;
+            A[i * n + j] = 0.5;
+            B[i * n + j] = 0.25;
+        }
+    }
+}
+
+void dump_mat(float * mtx, size_t n) {
+    for (size_t i = 0; i < n; ++i) {
+        for (size_t j = 0; j < n; ++j) {
+            printf("%f ", mtx[i * n + j]);
+        }
+        printf("\n");
+    }
+}
+
+float sum_mat(float * mtx, size_t n) {
+    float sum = 0.0;
+    for (size_t i = 0; i < n; ++i) {
+        for (size_t j = 0; j < n; ++j) {
+            sum += mtx[i * n + j];
+        }
+    }
+    return sum;
+}
+
+int main(int argc, char * argv[]) {
+    double ts, te;
+    double t_seq;
+
+    float * C;
+    float * A;
+    float * B;
+
+    // If number of arguments is not 1, print help
+    if (argc != 2) {
+        printf("%s: matrix_size\n", argv[0]);
+        return EXIT_FAILURE;
+    } 
+    const int  n = atoi(argv[1]); // matrix size
+
+    C = (float *) malloc(sizeof(*C) * n * n);
+    A = (float *) malloc(sizeof(*A) * n * n);
+    B = (float *) malloc(sizeof(*B) * n * n);
+
+    init_mat(C, A, B, n);
+    ts = omp_get_wtime();
+    matmul_seq(C, A, B, n);
+    te = omp_get_wtime();
+#if DUMP_MATRIX
+    dump_mat(C, n);
+#endif
+    t_seq = te - ts;
+    printf("Sum of matrix (serial):   %f, wall time %lf, speed-up %.2lf\n",
+           sum_mat(C, n), (te-ts), t_seq / (te-ts));
+
+    init_mat(C, A, B, n);
+    ts = omp_get_wtime();
+    matmul_par(C, A, B, n);
+    te = omp_get_wtime();
+#if DUMP_MATRIX
+    dump_mat(C, n);
+#endif
+    printf("Sum of matrix (parallel): %f, wall time %lf, speed-up %.2lf\n",
+           sum_mat(C, n), (te-ts), t_seq / (te-ts));
+
+    init_mat(C, A, B, n);
+    ts = omp_get_wtime();
+    matmul_tloop(C, A, B, n);
+    te = omp_get_wtime();
+#if DUMP_MATRIX
+    dump_mat(C, n);
+#endif
+    printf("Sum of matrix (taskloop): %f, wall time %lf, speed-up %.2lf\n",
+           sum_mat(C, n), (te-ts), t_seq / (te-ts));
+
+    init_mat(C, A, B, n);
+    ts = omp_get_wtime();
+    matmul_task(C, A, B, n);
+    te = omp_get_wtime();
+#if DUMP_MATRIX
+    dump_mat(C, n);
+#endif
+    printf("Sum of matrix (tasks):    %f, wall time %lf, speed-up %.2lf\n",
+           sum_mat(C, n), (te-ts), t_seq / (te-ts));
+
+    return EXIT_SUCCESS;
+}
--- a/07-fibonacci/Makefile
+++ b/07-fibonacci/Makefile
@ -0,0 +1,20 @@
+PROG=fibonacci
+
+CC=gcc
+CCFLAGS=-O3 -fopenmp
+LDFLAGS=-fopenmp
+
+debug:
+	$(CC) $(CCFLAGS) -g -O0 ${PROG}.c -o ${PROG}.exe
+
+release:
+	$(CC) $(CCFLAGS) -O3 ${PROG}.c -o ${PROG}.exe
+
+solution:
+	$(CC) $(CCFLAGS) -O3 ${PROG}-solved.c -o ${PROG}-solved.exe
+
+run:
+	./${PROG}.exe < input
+
+clean:
+	rm -f ${PROG}.exe ${PROG}-solved.exe  ${PROG}.o *~
--- a/07-fibonacci/fibonacci-solved.c
+++ b/07-fibonacci/fibonacci-solved.c
@ -0,0 +1,79 @@
+/*
+ *  OpenMP lecture exercises
+ *  Copyright (C) 2011 by Christian Terboven <terboven@rz.rwth-aachen.de>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <stdio.h>
+#include <omp.h>
+
+int ser_fib(int n)
+{
+  int x, y;
+  if (n < 2)
+    return n;
+
+  x = ser_fib(n - 1);
+
+  y = ser_fib(n - 2);
+
+  return x+y;
+}
+
+int fib(int n)
+{
+  int x, y;
+  if (n < 2)
+    return n;
+  else if (n < 30)
+    return ser_fib(n);
+
+  #pragma omp task shared(x)
+  {
+    x = fib(n - 1);
+  }
+
+  #pragma omp task shared(y)
+  {
+    y = fib(n - 2);
+  }
+
+  #pragma omp taskwait
+
+  return x+y;
+
+}
+
+
+int main()
+{
+  int n,fibonacci;
+  double starttime;
+  printf("Please insert n, to calculate fib(n): \n");
+  scanf("%d",&n);
+  starttime=omp_get_wtime();
+
+  #pragma omp parallel
+  #pragma omp single
+  {
+    fibonacci=fib(n);
+  }
+
+  printf("fib(%d)=%d \n",n,fibonacci);
+  printf("calculation took %lf sec\n",omp_get_wtime()-starttime);
+  return 0;
+}
--- a/07-fibonacci/fibonacci.c
+++ b/07-fibonacci/fibonacci.c
@ -0,0 +1,53 @@
+/*
+ *  OpenMP lecture exercises
+ *  Copyright (C) 2011 by Christian Terboven <terboven@rz.rwth-aachen.de>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <stdio.h>
+#include <omp.h>
+
+
+int fib(int n)
+{
+  int x, y;
+  if (n < 2)
+    return n;
+
+  x = fib(n - 1);
+
+  y = fib(n - 2);
+
+  return x+y;
+
+}
+
+
+int main()
+{
+  int n,fibonacci;
+  double starttime;
+  printf("Please insert n, to calculate fib(n): \n");
+  scanf("%d",&n);
+  starttime=omp_get_wtime();
+
+  fibonacci=fib(n);
+
+  printf("fib(%d)=%d \n",n,fibonacci);
+  printf("calculation took %lf sec\n",omp_get_wtime()-starttime);
+  return 0;
+}
--- a/07-fibonacci/input
+++ b/07-fibonacci/input
@ -0,0 +1 @@
+35
--- a/08-quicksort/Makefile
+++ b/08-quicksort/Makefile
@ -0,0 +1,20 @@
+PROG=quicksort
+
+CXX = g++
+CXXFLAGS=-O3 -fopenmp
+LDFLAGS=-fopenmp
+
+debug:
+	$(CXX) $(CXXFLAGS) -g -O0 ${PROG}.cpp -o ${PROG}.exe
+
+release:
+	$(CXX) $(CXXFLAGS) -O3 ${PROG}.cpp -o ${PROG}.exe
+
+solution:
+	$(CXX) $(CXXFLAGS) -O3 ${PROG}-solved.cpp -o ${PROG}-solved.exe
+
+run:
+	./${PROG}.exe
+
+clean:
+	rm -f ${PROG}.exe ${PROG}-solved.exe ${PROG}.o *~
--- a/08-quicksort/quicksort-solved.cpp
+++ b/08-quicksort/quicksort-solved.cpp
@ -0,0 +1,98 @@
+#include <iostream>
+#include <cstdlib>
+#include <ctime>
+#include <omp.h>
+#include <stdio.h>
+using namespace std;
+
+void init(int* array,int length){
+	srand((unsigned)time(0));
+	for(int i=0;i<length;i++)
+		array[i]=rand()%1234 + 1;
+}
+
+void swap(int &a, int &b){
+	int temp=a;
+	a=b;
+	b=temp;
+}
+
+
+int pivot(int* array, int first, int last){
+	int p = first;
+	int pivotElement = array[first];
+	for(int i=first+1;i<=last;i++){
+		if(array[i]<=pivotElement){
+			p++;
+			swap(array[i],array[p]);
+		}
+	}
+	swap(array[p],array[first]);
+	return p;
+}
+
+void serial_quicksort(int * array, int first, int last){
+	int pivotElement;
+	if(first<last){
+		pivotElement = pivot(array,first,last);
+		serial_quicksort(array,first,pivotElement-1);
+		serial_quicksort(array,pivotElement+1,last);
+	}
+}
+
+void quicksort(int * array, int first, int last){
+	int pivotElement;
+	if((last - first + 1) < 10000) {
+		serial_quicksort(array, first, last);
+	} else {
+		pivotElement = pivot(array,first,last);
+		#pragma omp task default(shared)
+		{
+			quicksort(array,first,pivotElement-1);
+		}
+		#pragma omp task default(shared)
+		{
+			quicksort(array,pivotElement+1,last);
+		}
+		#pragma omp taskwait
+	}
+}
+
+bool checkFn(int * array,int length){
+	for(int i=0;i<length-1;i++){
+		if(array[i]>array[i+1]){
+			cout << "array[" << i << "] > array[" << i+1 << "]" << endl;
+			return false;
+		}
+	}
+	return true;
+}
+
+int main(int argc, char *argv[]){
+	int length;
+	int *toBeSorted;
+	double t1,t2;
+	if(argc < 2){
+		length=6000000;
+	}else{
+		length=atoi(argv[1]);
+	}
+	toBeSorted = new int[length];
+
+	init(toBeSorted, length);
+	cout << "Sorting an array of " << length << " elements." << endl;	
+
+	t1=omp_get_wtime();
+	#pragma omp parallel
+	#pragma omp single
+	{
+		quicksort(toBeSorted,0,length-1);
+	}
+	t2=omp_get_wtime()-t1;
+
+	cout << "quicksort took " << t2 << " sec. to complete" << endl;
+	if (!checkFn(toBeSorted, length)) {
+		cout << "validation failed!" << endl;
+	}
+	delete [] toBeSorted;
+}
--- a/08-quicksort/quicksort.cpp
+++ b/08-quicksort/quicksort.cpp
@ -0,0 +1,76 @@
+#include <iostream>
+#include <cstdlib>
+#include <ctime>
+#include <omp.h>
+#include <stdio.h>
+using namespace std;
+
+void init(int* array,int length){
+	srand((unsigned)time(0));
+	for(int i=0;i<length;i++)
+		array[i]=rand()%1234 + 1;
+}
+
+void swap(int &a, int &b){
+	int temp=a;
+	a=b;
+	b=temp;
+}
+
+
+int pivot(int* array, int first, int last){
+	int p = first;
+	int pivotElement = array[first];
+	for(int i=first+1;i<=last;i++){
+		if(array[i]<=pivotElement){
+			p++;
+			swap(array[i],array[p]);
+		}
+	}
+	swap(array[p],array[first]);
+	return p;
+}
+
+void quicksort(int * array, int first, int last){
+	int pivotElement;
+	if(first<last){
+		pivotElement = pivot(array,first,last);
+		quicksort(array,first,pivotElement-1);
+		quicksort(array,pivotElement+1,last);
+	}
+}
+
+bool checkFn(int * array,int length){
+	for(int i=0;i<length-1;i++){
+		if(array[i]>array[i+1]){
+			cout << "array[" << i << "] > array[" << i+1 << "]" << endl;
+			return false;
+		}
+	}
+	return true;
+}
+
+int main(int argc, char *argv[]){
+	int length;
+	int *toBeSorted;
+	double t1,t2;
+	if(argc < 2){
+		length=6000000;
+	}else{
+		length=atoi(argv[1]);
+	}
+	toBeSorted = new int[length];
+
+	init(toBeSorted, length);
+	cout << "Sorting an array of " << length << " elements." << endl;	
+
+	t1=omp_get_wtime();
+	quicksort(toBeSorted,0,length-1);
+	t2=omp_get_wtime()-t1;
+
+	cout << "quicksort took " << t2 << " sec. to complete" << endl;
+	if (!checkFn(toBeSorted, length)) {
+		cout << "validation failed!" << endl;
+	}
+	delete [] toBeSorted;
+}
--- a/31
+++ b/31
@ -0,0 +1,31 @@
+ALL_RECURSIVE=all-recursive clean-recursive 
+
+TUT=SC19
+
+DIRS=$(shell ls -d *-* | sed '/_/d' 2>/dev/null)
+
+all:   $(CONFIG) all-recursive
+clean: $(CONFIG) clean-recursive
+
+$(ALL_RECURSIVE):
+	@failcom='exit 1';\
+   target=`echo $@ | sed s/-recursive//`; \
+   for subdir in $(DIRS); do \
+      (cd $$subdir && $(MAKE) $$target) || eval $$failcom; \
+   done;
+
+dist:
+	@failcom='exit 1';\
+   for subdir in $(DIRS); do \
+      (cd $$subdir && $(MAKE) clean) || eval $$failcom; \
+   done; \
+   tar --exclude="*-solved*" -cvzf ../mt-openmp-$(TUT).tar.gz --transform s/mt-openmp/mt-openmp-$(TUT)/ ../mt-openmp/*
+
+dist-solved:
+	@failcom='exit 1';\
+   for subdir in $(DIRS); do \
+      (cd $$subdir && $(MAKE) clean) || eval $$failcom; \
+   done; \
+   tar -cvzf ../mt-openmp-$(TUT)-solved.tar.gz --transform s/mt-openmp/mt-openmp-$(TUT)-solved/ ../mt-openmp/* 
+
+
--- a/slides/day-1/00-OpenMP_Introduction.pdf
+++ b/slides/day-1/00-OpenMP_Introduction.pdf
--- a/slides/day-1/01-OpenMP_Hands-on.pdf
+++ b/slides/day-1/01-OpenMP_Hands-on.pdf
--- a/slides/day-1/03-OpenMP-NUMA.pdf
+++ b/slides/day-1/03-OpenMP-NUMA.pdf
--- a/slides/day-2/04-OpenMP-Task-Intro.pdf
+++ b/slides/day-2/04-OpenMP-Task-Intro.pdf
--- a/slides/day-2/05-OpenMP-Task-Loops-Deps.pdf
+++ b/slides/day-2/05-OpenMP-Task-Loops-Deps.pdf
--- a/slides/day-2/06-OpenMP-SIMD.pdf
+++ b/slides/day-2/06-OpenMP-SIMD.pdf
--- a/slides/day-3/07-OpenMP-GPUs.pdf
+++ b/slides/day-3/07-OpenMP-GPUs.pdf
--- a/slides/day-3/08-Openmp-Misc-Advanced-Outlook.pdf
+++ b/slides/day-3/08-Openmp-Misc-Advanced-Outlook.pdf