//===-- matmul.c - Different implementations of matrix multiplies -*- C -*-===// // // Part of the LOMP Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include #include #include #define N 3072 #define DUMP_MATRIX 0 #define DUMP_TASKS 0 void matmul_seq(float * C, float * A, float * B, size_t n) { for (size_t i = 0; i < n; ++i) { for (size_t k = 0; k < n; ++k) { for (size_t j = 0; j < n; ++j) { C[i * n + j] += A[i * n + k] * B[k * n + j]; } } } } void matmul_par(float * C, float * A, float * B, size_t n) { #pragma omp parallel for schedule(static,8) firstprivate(n) for (size_t i = 0; i < n; ++i) { for (size_t k = 0; k < n; ++k) { for (size_t j = 0; j < n; ++j) { C[i * n + j] += A[i * n + k] * B[k * n + j]; } } } } void matmul_tloop(float * C, float * A, float * B, size_t n) { #pragma omp parallel firstprivate(n) #pragma omp single nowait #pragma omp taskloop for (size_t i = 0; i < n; ++i) { for (size_t k = 0; k < n; ++k) { for (size_t j = 0; j < n; ++j) { C[i * n + j] += A[i * n + k] * B[k * n + j]; } } } } void matmul_task(float * C, float * A, float * B, size_t n) { const size_t bf = 256; if (n % bf != 0) { printf("Blocking factor does not divide matrix size!\n"); exit(EXIT_FAILURE); } #pragma omp parallel firstprivate(n, bf) #pragma omp master { // work on the blocks of the matrix for (size_t ib = 0; ib < n; ib += bf) for (size_t kb = 0; kb < n; kb += bf) for (size_t jb = 0; jb < n; jb += bf) { #pragma omp task firstprivate(ib, kb, jb) firstprivate(n, bf) \ depend(inout:C[ib * n + jb:bf]) \ depend(in:A[ib * n + kb:bf]) \ depend(in:B[kb * n + jb:bf]) { #if DUMP_TASKS printf("task: C[%ld,%ld] += A[%ld,%ld] o B[%ld,%ld]\n", ib, jb, ib, kb, kb, jb); #endif // work on a single block`` for (size_t i = ib; i < (ib + bf); ++i) for (size_t k = kb; k < (kb + bf); ++k) for (size_t j = jb; j < (jb + bf); ++j) C[i * n + j] += A[i * n + k] * B[k * n + j]; } } } } void init_mat(float * C, float * A, float * B, size_t n) { for (size_t i = 0; i < n; ++i) { for (size_t j = 0; j < n; ++j) { C[i * n + j] = 0.0; A[i * n + j] = 0.5; B[i * n + j] = 0.25; } } } void dump_mat(float * mtx, size_t n) { for (size_t i = 0; i < n; ++i) { for (size_t j = 0; j < n; ++j) { printf("%f ", mtx[i * n + j]); } printf("\n"); } } float sum_mat(float * mtx, size_t n) { float sum = 0.0; for (size_t i = 0; i < n; ++i) { for (size_t j = 0; j < n; ++j) { sum += mtx[i * n + j]; } } return sum; } int main(int argc, char * argv[]) { double ts, te; double t_seq; float * C; float * A; float * B; // If number of arguments is not 1, print help if (argc != 2) { printf("%s: matrix_size\n", argv[0]); return EXIT_FAILURE; } const int n = atoi(argv[1]); // matrix size C = (float *) malloc(sizeof(*C) * n * n); A = (float *) malloc(sizeof(*A) * n * n); B = (float *) malloc(sizeof(*B) * n * n); init_mat(C, A, B, n); ts = omp_get_wtime(); matmul_seq(C, A, B, n); te = omp_get_wtime(); #if DUMP_MATRIX dump_mat(C, n); #endif t_seq = te - ts; printf("Sum of matrix (serial): %f, wall time %lf, speed-up %.2lf\n", sum_mat(C, n), (te-ts), t_seq / (te-ts)); init_mat(C, A, B, n); ts = omp_get_wtime(); matmul_par(C, A, B, n); te = omp_get_wtime(); #if DUMP_MATRIX dump_mat(C, n); #endif printf("Sum of matrix (parallel): %f, wall time %lf, speed-up %.2lf\n", sum_mat(C, n), (te-ts), t_seq / (te-ts)); init_mat(C, A, B, n); ts = omp_get_wtime(); matmul_tloop(C, A, B, n); te = omp_get_wtime(); #if DUMP_MATRIX dump_mat(C, n); #endif printf("Sum of matrix (taskloop): %f, wall time %lf, speed-up %.2lf\n", sum_mat(C, n), (te-ts), t_seq / (te-ts)); init_mat(C, A, B, n); ts = omp_get_wtime(); matmul_task(C, A, B, n); te = omp_get_wtime(); #if DUMP_MATRIX dump_mat(C, n); #endif printf("Sum of matrix (tasks): %f, wall time %lf, speed-up %.2lf\n", sum_mat(C, n), (te-ts), t_seq / (te-ts)); return EXIT_SUCCESS; }