forked from TASC/HLRS-OpenMP-GPU-2024
175 lines
4.9 KiB
C
175 lines
4.9 KiB
C
|
//===-- matmul.c - Different implementations of matrix multiplies -*- C -*-===//
|
||
|
//
|
||
|
// Part of the LOMP Project, under the Apache License v2.0 with LLVM Exceptions.
|
||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||
|
//
|
||
|
//===----------------------------------------------------------------------===//
|
||
|
|
||
|
#include <stdio.h>
|
||
|
#include <stdlib.h>
|
||
|
|
||
|
#include <omp.h>
|
||
|
|
||
|
#define N 3072
|
||
|
|
||
|
#define DUMP_MATRIX 0
|
||
|
#define DUMP_TASKS 0
|
||
|
|
||
|
void matmul_seq(float * C, float * A, float * B, size_t n) {
|
||
|
for (size_t i = 0; i < n; ++i) {
|
||
|
for (size_t k = 0; k < n; ++k) {
|
||
|
for (size_t j = 0; j < n; ++j) {
|
||
|
C[i * n + j] += A[i * n + k] * B[k * n + j];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void matmul_par(float * C, float * A, float * B, size_t n) {
|
||
|
#pragma omp parallel for schedule(static,8) firstprivate(n)
|
||
|
for (size_t i = 0; i < n; ++i) {
|
||
|
for (size_t k = 0; k < n; ++k) {
|
||
|
for (size_t j = 0; j < n; ++j) {
|
||
|
C[i * n + j] += A[i * n + k] * B[k * n + j];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void matmul_tloop(float * C, float * A, float * B, size_t n) {
|
||
|
#pragma omp parallel firstprivate(n)
|
||
|
#pragma omp single nowait
|
||
|
#pragma omp taskloop
|
||
|
for (size_t i = 0; i < n; ++i) {
|
||
|
for (size_t k = 0; k < n; ++k) {
|
||
|
for (size_t j = 0; j < n; ++j) {
|
||
|
C[i * n + j] += A[i * n + k] * B[k * n + j];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void matmul_task(float * C, float * A, float * B, size_t n) {
|
||
|
const size_t bf = 256;
|
||
|
if (n % bf != 0) {
|
||
|
printf("Blocking factor does not divide matrix size!\n");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
#pragma omp parallel firstprivate(n, bf)
|
||
|
#pragma omp master
|
||
|
{
|
||
|
// work on the blocks of the matrix
|
||
|
for (size_t ib = 0; ib < n; ib += bf)
|
||
|
for (size_t kb = 0; kb < n; kb += bf)
|
||
|
for (size_t jb = 0; jb < n; jb += bf) {
|
||
|
#pragma omp task firstprivate(ib, kb, jb) firstprivate(n, bf) \
|
||
|
depend(inout:C[ib * n + jb:bf]) \
|
||
|
depend(in:A[ib * n + kb:bf]) \
|
||
|
depend(in:B[kb * n + jb:bf])
|
||
|
{
|
||
|
#if DUMP_TASKS
|
||
|
printf("task: C[%ld,%ld] += A[%ld,%ld] o B[%ld,%ld]\n", ib, jb, ib, kb, kb, jb);
|
||
|
#endif
|
||
|
// work on a single block``
|
||
|
for (size_t i = ib; i < (ib + bf); ++i)
|
||
|
for (size_t k = kb; k < (kb + bf); ++k)
|
||
|
for (size_t j = jb; j < (jb + bf); ++j)
|
||
|
C[i * n + j] += A[i * n + k] * B[k * n + j];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void init_mat(float * C, float * A, float * B, size_t n) {
|
||
|
for (size_t i = 0; i < n; ++i) {
|
||
|
for (size_t j = 0; j < n; ++j) {
|
||
|
C[i * n + j] = 0.0;
|
||
|
A[i * n + j] = 0.5;
|
||
|
B[i * n + j] = 0.25;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void dump_mat(float * mtx, size_t n) {
|
||
|
for (size_t i = 0; i < n; ++i) {
|
||
|
for (size_t j = 0; j < n; ++j) {
|
||
|
printf("%f ", mtx[i * n + j]);
|
||
|
}
|
||
|
printf("\n");
|
||
|
}
|
||
|
}
|
||
|
|
||
|
float sum_mat(float * mtx, size_t n) {
|
||
|
float sum = 0.0;
|
||
|
for (size_t i = 0; i < n; ++i) {
|
||
|
for (size_t j = 0; j < n; ++j) {
|
||
|
sum += mtx[i * n + j];
|
||
|
}
|
||
|
}
|
||
|
return sum;
|
||
|
}
|
||
|
|
||
|
int main(int argc, char * argv[]) {
|
||
|
double ts, te;
|
||
|
double t_seq;
|
||
|
|
||
|
float * C;
|
||
|
float * A;
|
||
|
float * B;
|
||
|
|
||
|
// If number of arguments is not 1, print help
|
||
|
if (argc != 2) {
|
||
|
printf("%s: matrix_size\n", argv[0]);
|
||
|
return EXIT_FAILURE;
|
||
|
}
|
||
|
const int n = atoi(argv[1]); // matrix size
|
||
|
|
||
|
C = (float *) malloc(sizeof(*C) * n * n);
|
||
|
A = (float *) malloc(sizeof(*A) * n * n);
|
||
|
B = (float *) malloc(sizeof(*B) * n * n);
|
||
|
|
||
|
init_mat(C, A, B, n);
|
||
|
ts = omp_get_wtime();
|
||
|
matmul_seq(C, A, B, n);
|
||
|
te = omp_get_wtime();
|
||
|
#if DUMP_MATRIX
|
||
|
dump_mat(C, n);
|
||
|
#endif
|
||
|
t_seq = te - ts;
|
||
|
printf("Sum of matrix (serial): %f, wall time %lf, speed-up %.2lf\n",
|
||
|
sum_mat(C, n), (te-ts), t_seq / (te-ts));
|
||
|
|
||
|
init_mat(C, A, B, n);
|
||
|
ts = omp_get_wtime();
|
||
|
matmul_par(C, A, B, n);
|
||
|
te = omp_get_wtime();
|
||
|
#if DUMP_MATRIX
|
||
|
dump_mat(C, n);
|
||
|
#endif
|
||
|
printf("Sum of matrix (parallel): %f, wall time %lf, speed-up %.2lf\n",
|
||
|
sum_mat(C, n), (te-ts), t_seq / (te-ts));
|
||
|
|
||
|
init_mat(C, A, B, n);
|
||
|
ts = omp_get_wtime();
|
||
|
matmul_tloop(C, A, B, n);
|
||
|
te = omp_get_wtime();
|
||
|
#if DUMP_MATRIX
|
||
|
dump_mat(C, n);
|
||
|
#endif
|
||
|
printf("Sum of matrix (taskloop): %f, wall time %lf, speed-up %.2lf\n",
|
||
|
sum_mat(C, n), (te-ts), t_seq / (te-ts));
|
||
|
|
||
|
init_mat(C, A, B, n);
|
||
|
ts = omp_get_wtime();
|
||
|
matmul_task(C, A, B, n);
|
||
|
te = omp_get_wtime();
|
||
|
#if DUMP_MATRIX
|
||
|
dump_mat(C, n);
|
||
|
#endif
|
||
|
printf("Sum of matrix (tasks): %f, wall time %lf, speed-up %.2lf\n",
|
||
|
sum_mat(C, n), (te-ts), t_seq / (te-ts));
|
||
|
|
||
|
return EXIT_SUCCESS;
|
||
|
}
|