Created
October 1, 2016 12:42
-
-
Save jarvisnn/c211e44af9f4334fdb0c245ae9ca9f8a to your computer and use it in GitHub Desktop.
Matrix Multiplication code on GPU with CUDA
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /** | |
| * | |
| * Matrix Multiplication - CUDA for GPUs | |
| * | |
| * NUS CS3210 - Jarvis | |
| * | |
| **/ | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <time.h> | |
| #include <sys/time.h> | |
| #include <assert.h> | |
// Block Size
// Edge length (in threads) of a square CUDA thread block; also the
// shared-memory tile width used by mm_kernel below.
#define BLOCK_SIZE 32
// Global matrix dimension: every matrix in this program is size x size.
// Set once in main() from argv before any allocation.
int size;
// A square matrix stored as an array of row pointers; each row is a
// contiguous array of floats (row-major rows). Both levels are allocated
// with cudaMallocManaged so host and device code can dereference them.
typedef struct
{
	float ** element;
} matrix;
/**
 * Returns the current wall-clock time in nanoseconds.
 * Linux uses clock_gettime(CLOCK_REALTIME); other platforms fall
 * back to gettimeofday (microsecond resolution, scaled to ns).
 **/
long long wall_clock_time()
{
#ifdef __linux__
	struct timespec ts;
	clock_gettime(CLOCK_REALTIME, &ts);
	return (long long)ts.tv_sec * 1000000000ll + ts.tv_nsec;
#else
	struct timeval tv;
	gettimeofday(&tv, NULL);
	return (long long)tv.tv_sec * 1000000000ll + tv.tv_usec * 1000;
#endif
}
/**
 * Allocates managed memory for a size x size matrix.
 * Memory is allocated in row-major order: elements of the same
 * row live at contiguous addresses. Exits the process with an
 * error message if any CUDA allocation fails.
 **/
void allocate_matrix(matrix* m)
{
	cudaError_t err;

	// The array of row pointers itself.
	err = cudaMallocManaged((void**)&(m->element), size * sizeof(float*));
	if (err != cudaSuccess)
	{
		fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
		exit(1);
	}

	// One contiguous buffer of `size` floats per row.
	for (int row = 0; row < size; row++)
	{
		err = cudaMallocManaged((void**)&(m->element[row]), size * sizeof(float));
		if (err != cudaSuccess)
		{
			fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
			exit(1);
		}
	}
}
/**
 * Releases all managed memory held by a matrix:
 * every row buffer first, then the row-pointer array.
 **/
void free_matrix(matrix* m) {
	for (int row = 0; row < size; row++)
		cudaFree(m->element[row]);
	cudaFree(m->element);
}
/**
 * Fills every element of the matrix with a pseudo-random
 * integer value between 0 and 9 (from the host-side rand()).
 **/
void init_matrix(matrix m)
{
	for (int r = 0; r < size; r++)
	{
		float* row = m.element[r];
		for (int c = 0; c < size; c++)
			row[c] = rand() % 10;
	}
}
/**
 * Sets every element of the matrix to zero.
 **/
void init_matrix_zero(matrix m)
{
	for (int r = 0; r < size; r++)
	{
		float* row = m.element[r];
		for (int c = 0; c < size; c++)
			row[c] = 0.0f;
	}
}
/**
 * Multiplies matrix @a with matrix @b storing
 * the result in matrix @result
 *
 * Classic O(n^3) triple loop. Each output cell is summed into a
 * local accumulator and then stored, so @result no longer needs to
 * be zero-initialized by the caller — the original `+=` read
 * whatever the freshly allocated (uninitialized) buffer contained.
 */
void mm(matrix a, matrix b, matrix result)
{
	int i, j, k;
	for (i = 0; i < size; i++)
		for (j = 0; j < size; j++)
		{
			float sum = 0.0f;
			for (k = 0; k < size; k++)
				sum += a.element[i][k] * b.element[k][j];
			result.element[i][j] = sum;
		}
}
/**
 * Tiled matrix-multiply kernel: each thread computes one element
 * result[row][col] of the size x size product a * b.
 *
 * Expected launch: blockDim = (BLOCK_SIZE, BLOCK_SIZE),
 * gridDim = (ceil(size/BLOCK_SIZE), ceil(size/BLOCK_SIZE)).
 * Uses 2 * BLOCK_SIZE^2 floats of static shared memory.
 *
 * Fix vs. the original: the row index is now derived from the y
 * thread coordinate and the column from x. Because threadIdx.x
 * varies fastest within a warp, a warp now touches consecutive
 * columns of one row, so global loads and the final store are
 * coalesced, and the shared-tile reads are broadcast/stride-1
 * instead of the original's 32-way bank conflicts on a_sub[tx][k].
 */
__global__ void mm_kernel(matrix a, matrix b, matrix result, int size)
{
	int tx = threadIdx.x;
	int ty = threadIdx.y;
	// Output cell owned by this thread.
	int row = blockIdx.y * BLOCK_SIZE + ty;
	int col = blockIdx.x * BLOCK_SIZE + tx;

	// Shared-memory tiles: one BLOCK_SIZE-wide slice of the k
	// dimension of a and b is staged per loop iteration.
	__shared__ float a_tile[BLOCK_SIZE][BLOCK_SIZE];
	__shared__ float b_tile[BLOCK_SIZE][BLOCK_SIZE];

	float acc = 0.0f;

	for (int start = 0; start < size; start += BLOCK_SIZE) {
		// Cooperative load: each thread fetches one element of each
		// tile; out-of-range cells are padded with zero so the inner
		// product loop needs no bounds checks.
		a_tile[ty][tx] = (row < size && start + tx < size)
		                     ? a.element[row][start + tx] : 0.0f;
		b_tile[ty][tx] = (start + ty < size && col < size)
		                     ? b.element[start + ty][col] : 0.0f;
		// All tile data must be in place before anyone reads it.
		__syncthreads();

		// Partial dot product over this k-slice. Unroll is safe:
		// the trip count is the compile-time constant BLOCK_SIZE.
#pragma unroll
		for (int k = 0; k < BLOCK_SIZE; k++)
			acc += a_tile[ty][k] * b_tile[k][tx];

		// Do not overwrite the tiles until every thread is done.
		__syncthreads();
	}

	// Guard the store: the grid may overhang the matrix edge. The
	// guard sits after all __syncthreads(), so no thread exits early
	// past a barrier its blockmates still need.
	if (row < size && col < size)
		result.element[row][col] = acc;
}
/**
 * Prints the matrix to stdout, one labelled row per line,
 * with each element formatted as "%6.2f".
 **/
void print_matrix(matrix m)
{
	for (int r = 0; r < size; r++)
	{
		printf("row %4d: ", r);
		for (int c = 0; c < size; c++)
			printf("%6.2f ", m.element[r][c]);
		printf("\n");
	}
}
/**
 * Allocates and initializes the input matrices, runs the sequential
 * and the CUDA matrix multiplications, reports timings to stderr,
 * and verifies that both result matrices agree element-for-element.
 *
 * Fixes vs. the original:
 *  - result1 is zeroed before mm(), which accumulates with += and
 *    previously read uninitialized managed memory;
 *  - the mismatch report no longer prints an off-by-one row index
 *    (after the inner `break`, the outer for-loop incremented i once
 *    more before its `correct &&` condition stopped it).
 */
void work()
{
	matrix a, b, result1, result2;
	long long before, after;
	int i, j, dim;
	int bad_i = -1, bad_j = -1;   // first mismatching cell, if any
	cudaError_t rc;

	// Allocate memory for matrices
	allocate_matrix(&a);
	allocate_matrix(&b);
	allocate_matrix(&result1);
	allocate_matrix(&result2);

	// Initialize matrix elements; mm() accumulates into result1,
	// so it must start from all zeros.
	init_matrix(a);
	init_matrix(b);
	init_matrix_zero(result1);

	// Perform sequential matrix multiplication
	before = wall_clock_time();
	mm(a, b, result1);
	after = wall_clock_time();
	fprintf(stderr, "Matrix multiplication on CPU took %1.2f seconds\n", ((float)(after - before))/1000000000);

	// Perform CUDA matrix multiplication
	dim3 block(BLOCK_SIZE, BLOCK_SIZE); // a block of BLOCK_SIZE x BLOCK_SIZE CUDA threads
	dim = (size % BLOCK_SIZE == 0) ? size / BLOCK_SIZE : size / BLOCK_SIZE + 1;  // ceil-div
	dim3 grid(dim, dim); // a grid of CUDA thread blocks
	before = wall_clock_time();
	mm_kernel<<<grid, block>>>(a, b, result2, size);
	cudaDeviceSynchronize();
	after = wall_clock_time();
	fprintf(stderr, "Matrix multiplication on GPU took %1.2f seconds\n", ((float)(after - before))/1000000000);

	// Surface any launch-config or execution error from the kernel
	// (checked after the sync so async faults are visible too).
	rc = cudaGetLastError();
	if (rc != cudaSuccess)
		printf("Last CUDA error %s\n", cudaGetErrorString(rc));

	// Compare the results, recording the first differing cell.
	for (i = 0; i < size && bad_i < 0; i++)
		for (j = 0; j < size; j++)
			if (result1.element[i][j] != result2.element[i][j]) {
				bad_i = i;
				bad_j = j;
				break;
			}
	if (bad_i < 0)
		printf("The result matrices are identical!\n");
	else
		printf("Difference in result matrices at element (%d, %d)!\n", bad_i, bad_j);

	free_matrix(&a);
	free_matrix(&b);
	free_matrix(&result1);
	free_matrix(&result2);
}
/**
 * Entry point. Usage: program [size]
 * Parses the optional matrix dimension (default 1024), validates it,
 * then runs the CPU and GPU multiplications and compares the results.
 */
int main(int argc, char ** argv)
{
	srand(0);   // fixed seed so runs are reproducible
	printf("Usage: %s <size>\n", argv[0]);
	if (argc >= 2)
		size = atoi(argv[1]);
	else
		size = 1024;
	// atoi returns 0 on non-numeric input; reject non-positive sizes
	// instead of multiplying nothing or allocating a negative count.
	if (size <= 0)
	{
		fprintf(stderr, "Invalid size '%s': must be a positive integer\n",
		        argc >= 2 ? argv[1] : "");
		return 1;
	}
	// The original message said "Sequential ..." although both the
	// CPU and GPU multiplications run.
	fprintf(stderr, "Matrix multiplication of size %d\n", size);
	// Multiply the matrices
	work();
	return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Simple parallel matrix multiplication code in CUDA C; we also try different block sizes and compare the timings.
https://debuggingsolution.blogspot.com/2021/11/matrix-multiplication-in-cuda.html