Skip to content

Instantly share code, notes, and snippets.

@jarvisnn
Created October 1, 2016 12:42
Show Gist options
  • Select an option

  • Save jarvisnn/c211e44af9f4334fdb0c245ae9ca9f8a to your computer and use it in GitHub Desktop.

Select an option

Save jarvisnn/c211e44af9f4334fdb0c245ae9ca9f8a to your computer and use it in GitHub Desktop.
Matrix Multiplication code on GPU with CUDA
/**
*
* Matrix Multiplication - CUDA for GPUs
*
* NUS CS3210 - Jarvis
*
**/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <assert.h>
// Block Size: edge length of one CUDA thread block (BLOCK_SIZE x BLOCK_SIZE
// threads) and of the shared-memory tiles loaded inside mm_kernel.
#define BLOCK_SIZE 32
// Global matrix dimension: all matrices are size x size. Set in main().
int size;
// A square matrix stored as an array of row pointers; each row is a
// separately allocated contiguous array of `size` floats (allocate_matrix).
typedef struct
{
float ** element;
} matrix;
/**
 * Returns the current wall-clock time in nanoseconds since the epoch.
 * Uses clock_gettime on Linux and gettimeofday (microsecond resolution,
 * scaled to nanoseconds) everywhere else.
 **/
long long wall_clock_time()
{
#ifdef __linux__
    struct timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    return (long long)ts.tv_sec * 1000000000ll + (long long)ts.tv_nsec;
#else
    struct timeval now;
    gettimeofday(&now, NULL);
    return (long long)now.tv_sec * 1000000000ll + (long long)(now.tv_usec * 1000);
#endif
}
/**
* Allocates memory for a matrix of size SIZE
* The memory is allocated row-major order, i.e.
* elements from the same row are allocated at contiguous
* memory addresses.
**/
/**
 * Allocates a size x size matrix in CUDA managed memory.
 * The row-pointer array is allocated first, then each row is allocated
 * as its own contiguous buffer of `size` floats (row-major within a row).
 * Prints the CUDA error string and exits on any allocation failure.
 **/
void allocate_matrix(matrix* m)
{
    cudaError_t rc;

    // Array holding one pointer per row.
    rc = cudaMallocManaged((void**)&(m->element), sizeof(float*) * size);
    if (rc != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(rc));
        exit(1);
    }

    // One contiguous buffer of `size` floats per row.
    for (int row = 0; row < size; row++) {
        rc = cudaMallocManaged((void**)&(m->element[row]), sizeof(float) * size);
        if (rc != cudaSuccess) {
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(rc));
            exit(1);
        }
    }
}
/**
* Free the memory allocated for a matrix.
**/
/**
 * Releases all managed memory held by matrix m:
 * every row buffer first, then the row-pointer array itself.
 **/
void free_matrix(matrix* m) {
    int row;
    for (row = 0; row < size; row++) {
        cudaFree(m->element[row]);
    }
    cudaFree(m->element);
}
/**
* Initializes the elements of the matrix with
* random values between 0 and 9
**/
/**
 * Fills every element of m with a pseudo-random integer value in [0, 9]
 * (deterministic given the srand() seed set in main).
 **/
void init_matrix(matrix m)
{
    int row, col;
    for (row = 0; row < size; row++) {
        for (col = 0; col < size; col++) {
            m.element[row][col] = rand() % 10;
        }
    }
}
/**
* Initializes the elements of the matrix with
* element 0.
**/
/**
 * Sets every element of m to zero.
 **/
void init_matrix_zero(matrix m)
{
    int row, col;
    for (row = 0; row < size; row++)
        for (col = 0; col < size; col++)
            m.element[row][col] = 0.0f;
}
/**
* Multiplies matrix @a with matrix @b storing
* the result in matrix @result
*
* The multiplication algorithm is the O(n^3)
* algorithm
*/
/**
 * Multiplies matrix @a with matrix @b storing
 * the result in matrix @result (result = a * b; previous contents
 * of @result are overwritten).
 *
 * The multiplication algorithm is the classic O(n^3) triple loop,
 * used as the CPU reference implementation.
 *
 * Fix: the original accumulated with `+=` directly into result, which
 * silently required the caller to zero the matrix first — work() never
 * did, and managed memory is not guaranteed to be zero-initialized.
 * Accumulating in a local and assigning makes the function correct for
 * uninitialized output buffers and matches the GPU kernel, which also
 * assigns (result.element[cx][cy] = c) rather than accumulating.
 */
void mm(matrix a, matrix b, matrix result)
{
    int i, j, k;
    // Do the multiplication
    for (i = 0; i < size; i++)
        for (j = 0; j < size; j++)
        {
            float sum = 0.0f;
            for (k = 0; k < size; k++)
                sum += a.element[i][k] * b.element[k][j];
            result.element[i][j] = sum;
        }
}
/**
* Each kernel computes the result element (i,j).
*/
/**
 * Tiled matrix-multiplication kernel: each thread computes exactly one
 * output element result[cx][cy] of the product a * b.
 *
 * Launch expectations (see work()): block = (BLOCK_SIZE, BLOCK_SIZE)
 * threads, grid = ceil(size / BLOCK_SIZE) blocks in each dimension.
 * Threads whose (cx, cy) falls outside the matrix still participate in
 * the tile loads and barriers (loading zeros), and only skip the final
 * store — so __syncthreads() is always reached by the whole block.
 */
__global__ void mm_kernel(matrix a, matrix b, matrix result, int size)
{
// Block Index and Thread Index
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
// Current cell to be calculated (result[cx][cy]);
// NOTE(review): threadIdx.x selects the ROW here (cx), so a warp's
// lanes read different rows of a/b in the loads below — global
// accesses look uncoalesced and a_sub[tx][k] strides shared memory
// by BLOCK_SIZE floats. Correctness is unaffected; swapping the
// tx/ty roles or padding the tiles would likely be faster — confirm
// with a profiler before changing.
int cx = bx * BLOCK_SIZE + tx;
int cy = by * BLOCK_SIZE + ty;
// Variables
int blkStart, k;
// Variable to store the value of result[cx][cy]
float c = 0;
// Walk along the shared dimension in BLOCK_SIZE-wide phases: each phase
// stages one BLOCK_SIZE x BLOCK_SIZE tile of a and of b in shared memory,
// so the block never holds more than two tiles at once.
for (blkStart = 0; blkStart < size; blkStart += BLOCK_SIZE) {
// Shared mem for the sub-sub-matrices of a and b
__shared__ float a_sub[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float b_sub[BLOCK_SIZE][BLOCK_SIZE];
// Load sub-sub-matrices, each thread loads 1 cell of each tile;
// out-of-range cells are padded with 0 so they contribute nothing
// to the dot product below.
a_sub[tx][ty] = (cx < size && blkStart + ty < size) ? a.element[cx][blkStart + ty] : 0;
b_sub[tx][ty] = (blkStart + tx < size && cy < size) ? b.element[blkStart + tx][cy] : 0;
// Barrier: both tiles must be fully staged before any thread reads them.
__syncthreads();
// Accumulate this phase's partial dot product from the two tiles.
// Unroll is a minor improvement for a fixed-trip-count loop.
#pragma unroll
for (k = 0; k < BLOCK_SIZE; k++) {
c += a_sub[tx][k] * b_sub[k][ty];
}
// Barrier: all reads of this phase's tiles must finish before the
// next iteration overwrites them.
__syncthreads();
}
// Guard the store: threads in the ragged edge of the grid (when size is
// not a multiple of BLOCK_SIZE) have nothing to write.
if (cx >= size || cy >= size) return;
result.element[cx][cy] = c;
}
/**
 * Prints the whole matrix to stdout, one labeled, formatted row per line.
 **/
void print_matrix(matrix m)
{
    int row, col;
    for (row = 0; row < size; row++)
    {
        printf("row %4d: ", row);
        for (col = 0; col < size; col++)
            printf("%6.2f ", m.element[row][col]);
        printf("\n");
    }
}
/**
 * Benchmark driver: allocates and fills the input matrices, runs the
 * O(n^3) CPU multiply and the tiled GPU kernel, reports both timings
 * on stderr, and verifies the two result matrices agree element-wise.
 *
 * Fixes vs. the original:
 *  - result1 is now zeroed before mm(), which accumulates with `+=`;
 *    managed memory is not guaranteed to start at zero.
 *  - the first mismatch's indices are recorded explicitly: previously
 *    the outer loop incremented i once more after the inner `break`
 *    before the `correct` condition stopped it, so the reported row
 *    was off by one.
 **/
void work()
{
    matrix a, b, result1, result2;
    long long before, after;
    int correct, i, j, dim;
    int bad_i = -1, bad_j = -1;
    cudaError_t rc;

    // Allocate memory for matrices
    allocate_matrix(&a);
    allocate_matrix(&b);
    allocate_matrix(&result1);
    allocate_matrix(&result2);

    // Initialize matrix elements
    init_matrix(a);
    init_matrix(b);
    // mm() accumulates into its output, so it must start from zero.
    init_matrix_zero(result1);

    // Perform sequential matrix multiplication
    before = wall_clock_time();
    mm(a, b, result1);
    after = wall_clock_time();
    fprintf(stderr, "Matrix multiplication on CPU took %1.2f seconds\n", ((float)(after - before))/1000000000);

    // Perform CUDA matrix multiplication:
    // one BLOCK_SIZE x BLOCK_SIZE thread block per output tile, with
    // enough blocks (ceil division) to cover sizes that are not a
    // multiple of BLOCK_SIZE.
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    dim = (size % BLOCK_SIZE == 0) ? size / BLOCK_SIZE : size / BLOCK_SIZE + 1;
    dim3 grid(dim, dim);
    before = wall_clock_time();
    mm_kernel<<<grid, block>>>(a, b, result2, size);
    cudaDeviceSynchronize();
    after = wall_clock_time();
    fprintf(stderr, "Matrix multiplication on GPU took %1.2f seconds\n", ((float)(after - before))/1000000000);

    // was there any error? Launch-configuration errors and asynchronous
    // kernel faults both surface here, after the synchronize.
    rc = cudaGetLastError();
    if (rc != cudaSuccess)
        printf("Last CUDA error %s\n", cudaGetErrorString(rc));

    // Compare the results, recording the first mismatching cell.
    correct = 1;
    for (i = 0; correct && i < size; i++)
        for (j = 0; j < size; j++)
            if (result1.element[i][j] != result2.element[i][j]) {
                correct = 0;
                bad_i = i;
                bad_j = j;
                break;
            }
    if (correct)
        printf("The result matrices are identical!\n");
    else
        printf("Difference in result matrices at element (%d, %d)!\n", bad_i, bad_j);

    free_matrix(&a);
    free_matrix(&b);
    free_matrix(&result1);
    free_matrix(&result2);
}
/**
 * Entry point: seeds the RNG deterministically, reads the optional
 * matrix size from argv (default 1024), and runs the benchmark.
 **/
int main(int argc, char ** argv)
{
    srand(0);
    printf("Usage: %s <size>\n", argv[0]);
    size = (argc >= 2) ? atoi(argv[1]) : 1024;
    fprintf(stderr,"Sequential matrix multiplication of size %d\n", size);
    // Multiply the matrices
    work();
    return 0;
}
@rawstar134
Copy link

A simple parallel matrix multiplication in CUDA C. It is also worth experimenting with different block sizes and comparing the resulting timings:
https://debuggingsolution.blogspot.com/2021/11/matrix-multiplication-in-cuda.html

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment