-
-
Save allanmac/4ff11985c3562830989f to your computer and use it in GitHub Desktop.
| // | |
| // | |
| // | |
| #include <stdlib.h> | |
| #include <stdio.h> | |
| // | |
| // | |
| // | |
| #include "assert_cuda.h" | |
| // | |
| // | |
| // | |
| cudaError_t | |
| cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort) | |
| { | |
| if (code != cudaSuccess) | |
| { | |
| fprintf(stderr,"cuda_assert: %s %s %d\n",cudaGetErrorString(code),file,line); | |
| if (abort) | |
| { | |
| cudaDeviceReset(); | |
| exit(code); | |
| } | |
| } | |
| return code; | |
| } | |
| // | |
| // | |
| // |
| // | |
| // | |
| // | |
| #pragma once | |
| // | |
| // | |
| // | |
| #include <cuda_runtime.h> | |
| #include <stdbool.h> | |
| // | |
| // Beware that NVCC doesn't work with C files and __VA_ARGS__ | |
| // | |
| cudaError_t cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort); | |
| #define cuda(...) cuda_assert((cuda##__VA_ARGS__), __FILE__, __LINE__, true); | |
| // | |
| // | |
| // |
| // | |
| // | |
| // | |
| #include <glad/glad.h> | |
| #include <GLFW/glfw3.h> | |
| #include <cuda_gl_interop.h> | |
| #include <stdlib.h> | |
| // | |
| // | |
| // | |
| #include "assert_cuda.h" | |
| #include "interop.h" | |
| // | |
| // | |
| // | |
| struct pxl_interop | |
| { | |
| // split GPUs? | |
| bool multi_gpu; | |
| // number of fbo's | |
| int count; | |
| int index; | |
| // w x h | |
| int width; | |
| int height; | |
| // GL buffers | |
| GLuint* fb; | |
| GLuint* rb; | |
| // CUDA resources | |
| cudaGraphicsResource_t* cgr; | |
| cudaArray_t* ca; | |
| }; | |
| // | |
| // | |
| // | |
| struct pxl_interop* | |
| pxl_interop_create(const bool multi_gpu, const int fbo_count) | |
| { | |
| struct pxl_interop* const interop = calloc(1,sizeof(*interop)); | |
| interop->multi_gpu = multi_gpu; | |
| interop->count = fbo_count; | |
| interop->index = 0; | |
| // allocate arrays | |
| interop->fb = calloc(fbo_count,sizeof(*(interop->fb ))); | |
| interop->rb = calloc(fbo_count,sizeof(*(interop->rb ))); | |
| interop->cgr = calloc(fbo_count,sizeof(*(interop->cgr))); | |
| interop->ca = calloc(fbo_count,sizeof(*(interop->ca))); | |
| // render buffer object w/a color buffer | |
| glCreateRenderbuffers(fbo_count,interop->rb); | |
| // frame buffer object | |
| glCreateFramebuffers(fbo_count,interop->fb); | |
| // attach rbo to fbo | |
| for (int index=0; index<fbo_count; index++) | |
| { | |
| glNamedFramebufferRenderbuffer(interop->fb[index], | |
| GL_COLOR_ATTACHMENT0, | |
| GL_RENDERBUFFER, | |
| interop->rb[index]); | |
| } | |
| // return it | |
| return interop; | |
| } | |
| void | |
| pxl_interop_destroy(struct pxl_interop* const interop) | |
| { | |
| cudaError_t cuda_err; | |
| // unregister CUDA resources | |
| for (int index=0; index<interop->count; index++) | |
| { | |
| if (interop->cgr[index] != NULL) | |
| cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index])); | |
| } | |
| // delete rbo's | |
| glDeleteRenderbuffers(interop->count,interop->rb); | |
| // delete fbo's | |
| glDeleteFramebuffers(interop->count,interop->fb); | |
| // free buffers and resources | |
| free(interop->fb); | |
| free(interop->rb); | |
| free(interop->cgr); | |
| free(interop->ca); | |
| // free interop | |
| free(interop); | |
| } | |
| // | |
| // | |
| // | |
| cudaError_t | |
| pxl_interop_size_set(struct pxl_interop* const interop, const int width, const int height) | |
| { | |
| cudaError_t cuda_err = cudaSuccess; | |
| // save new size | |
| interop->width = width; | |
| interop->height = height; | |
| // resize color buffer | |
| for (int index=0; index<interop->count; index++) | |
| { | |
| // unregister resource | |
| if (interop->cgr[index] != NULL) | |
| cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index])); | |
| // resize rbo | |
| glNamedRenderbufferStorage(interop->rb[index],GL_RGBA8,width,height); | |
| // probe fbo status | |
| // glCheckNamedFramebufferStatus(interop->fb[index],0); | |
| // register rbo | |
| cuda_err = cuda(GraphicsGLRegisterImage(&interop->cgr[index], | |
| interop->rb[index], | |
| GL_RENDERBUFFER, | |
| cudaGraphicsRegisterFlagsSurfaceLoadStore | | |
| cudaGraphicsRegisterFlagsWriteDiscard)); | |
| } | |
| // map graphics resources | |
| cuda_err = cuda(GraphicsMapResources(interop->count,interop->cgr,0)); | |
| // get CUDA Array refernces | |
| for (int index=0; index<interop->count; index++) | |
| { | |
| cuda_err = cuda(GraphicsSubResourceGetMappedArray(&interop->ca[index], | |
| interop->cgr[index], | |
| 0,0)); | |
| } | |
| // unmap graphics resources | |
| cuda_err = cuda(GraphicsUnmapResources(interop->count,interop->cgr,0)); | |
| return cuda_err; | |
| } | |
| void | |
| pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* const height) | |
| { | |
| *width = interop->width; | |
| *height = interop->height; | |
| } | |
| // | |
| // | |
| // | |
| cudaError_t | |
| pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream) | |
| { | |
| if (!interop->multi_gpu) | |
| return cudaSuccess; | |
| // map graphics resources | |
| return cuda(GraphicsMapResources(1,&interop->cgr[interop->index],stream)); | |
| } | |
| cudaError_t | |
| pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream) | |
| { | |
| if (!interop->multi_gpu) | |
| return cudaSuccess; | |
| return cuda(GraphicsUnmapResources(1,&interop->cgr[interop->index],stream)); | |
| } | |
| cudaError_t | |
| pxl_interop_array_map(struct pxl_interop* const interop) | |
| { | |
| // | |
| // FIXME -- IS THIS EVEN NEEDED? | |
| // | |
| cudaError_t cuda_err; | |
| // get a CUDA Array | |
| cuda_err = cuda(GraphicsSubResourceGetMappedArray(&interop->ca[interop->index], | |
| interop->cgr[interop->index], | |
| 0,0)); | |
| return cuda_err; | |
| } | |
| // | |
| // | |
| // | |
| cudaArray_const_t | |
| pxl_interop_array_get(struct pxl_interop* const interop) | |
| { | |
| return interop->ca[interop->index]; | |
| } | |
| int | |
| pxl_interop_index_get(struct pxl_interop* const interop) | |
| { | |
| return interop->index; | |
| } | |
| // | |
| // | |
| // | |
| void | |
| pxl_interop_swap(struct pxl_interop* const interop) | |
| { | |
| interop->index = (interop->index + 1) % interop->count; | |
| } | |
| // | |
| // | |
| // | |
| void | |
| pxl_interop_clear(struct pxl_interop* const interop) | |
| { | |
| /* | |
| static const GLenum attachments[] = { GL_COLOR_ATTACHMENT0 }; | |
| glInvalidateNamedFramebufferData(interop->fb[interop->index],1,attachments); | |
| */ | |
| const GLfloat clear_color[] = { 1.0f, 1.0f, 1.0f, 1.0f }; | |
| glClearNamedFramebufferfv(interop->fb[interop->index],GL_COLOR,0,clear_color); | |
| } | |
| // | |
| // | |
| // | |
| void | |
| pxl_interop_blit(struct pxl_interop* const interop) | |
| { | |
| glBlitNamedFramebuffer(interop->fb[interop->index],0, | |
| 0,0, interop->width,interop->height, | |
| 0,interop->height,interop->width,0, | |
| GL_COLOR_BUFFER_BIT, | |
| GL_NEAREST); | |
| } | |
| // | |
| // | |
| // |
| // | |
| // | |
| // | |
| #pragma once | |
| // | |
| // | |
| // | |
| #include <cuda_runtime.h> | |
| #include <stdbool.h> | |
| // | |
| // | |
| // | |
| struct pxl_interop* | |
| pxl_interop_create(const bool multi_gpu, const int fbo_count); | |
| void | |
| pxl_interop_destroy(struct pxl_interop* const interop); | |
| // | |
| // | |
| // | |
| cudaError_t | |
| pxl_interop_size_set(struct pxl_interop* const interop, const int width, const int height); | |
| void | |
| pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* const height); | |
| // | |
| // | |
| // | |
| cudaError_t | |
| pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream); | |
| cudaError_t | |
| pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream); | |
| cudaError_t | |
| pxl_interop_array_map(struct pxl_interop* const interop); | |
| // | |
| // | |
| // | |
| cudaArray_const_t | |
| pxl_interop_array_get(struct pxl_interop* const interop); | |
| cudaStream_t | |
| pxl_interop_stream_get(struct pxl_interop* const interop); | |
| int | |
| pxl_interop_index_get(struct pxl_interop* const interop); | |
| // | |
| // | |
| // | |
| void | |
| pxl_interop_swap(struct pxl_interop* const interop); | |
| void | |
| pxl_interop_clear(struct pxl_interop* const interop); | |
| void | |
| pxl_interop_blit(struct pxl_interop* const interop); | |
| // | |
| // | |
| // |
| // -*- compile-command: "nvcc arch sm_50 -Xptxas=-v -cubin kernel.cu"; -*- | |
| // | |
| // | |
| // | |
| #ifdef __cplusplus | |
| extern "C" { | |
| #endif | |
| #include "assert_cuda.h" | |
| #ifdef __cplusplus | |
| } | |
| #endif | |
| // | |
| // | |
| // | |
| #define PXL_KERNEL_THREADS_PER_BLOCK 256 // enough for 4Kx2 monitor | |
| // | |
| // | |
| // | |
| surface<void,cudaSurfaceType2D> surf; | |
| // | |
| // | |
| // | |
| union pxl_rgbx_24 | |
| { | |
| uint1 b32; | |
| struct { | |
| unsigned r : 8; | |
| unsigned g : 8; | |
| unsigned b : 8; | |
| unsigned na : 8; | |
| }; | |
| }; | |
| // | |
| // | |
| // | |
| extern "C" | |
| __global__ | |
| void | |
| pxl_kernel(const int width, const int height) | |
| { | |
| // pixel coordinates | |
| const int idx = (blockDim.x * blockIdx.x) + threadIdx.x; | |
| const int x = idx % width; | |
| const int y = idx / width; | |
| #if 1 | |
| // pixel color | |
| const int t = (unsigned int)clock() / 1100000; // 1.1 GHz | |
| const int xt = (idx + t) % width; | |
| const unsigned int ramp = (unsigned int)(((float)xt / (float)(width-1)) * 255.0f + 0.5f); | |
| const unsigned int bar = ((y + t) / 32) & 3; | |
| union pxl_rgbx_24 rgbx; | |
| rgbx.r = (bar == 0) || (bar == 1) ? ramp : 0; | |
| rgbx.g = (bar == 0) || (bar == 2) ? ramp : 0; | |
| rgbx.b = (bar == 0) || (bar == 3) ? ramp : 0; | |
| rgbx.na = 255; | |
| #else // DRAW A RED BORDER TO VALIDATE FLIPPED BLIT | |
| const bool border = (x == 0) || (x == width-1) || (y == 0) || (y == height-1); | |
| union pxl_rgbx_24 rgbx = { border ? 0xFF0000FF : 0xFF000000 }; | |
| #endif | |
| surf2Dwrite(rgbx.b32, // even simpler: (unsigned int)clock() | |
| surf, | |
| x*sizeof(rgbx), | |
| y, | |
| cudaBoundaryModeZero); // squelches out-of-bound writes | |
| } | |
| // | |
| // | |
| // | |
| extern "C" | |
| cudaError_t | |
| pxl_kernel_launcher(cudaArray_const_t array, | |
| const int width, | |
| const int height, | |
| cudaEvent_t event, | |
| cudaStream_t stream) | |
| { | |
| cudaError_t cuda_err; | |
| // cuda_err = cudaEventRecord(event,stream); | |
| cuda_err = cuda(BindSurfaceToArray(surf,array)); | |
| if (cuda_err) | |
| return cuda_err; | |
| const int blocks = (width * height + PXL_KERNEL_THREADS_PER_BLOCK - 1) / PXL_KERNEL_THREADS_PER_BLOCK; | |
| // cuda_err = cudaEventRecord(event,stream); | |
| if (blocks > 0) | |
| pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width,height); | |
| // cuda_err = cudaStreamWaitEvent(stream,event,0); | |
| return cudaSuccess; | |
| } | |
| // | |
| // | |
| // |
| // | |
| // | |
| // | |
| #include <glad/glad.h> | |
| #include <GLFW/glfw3.h> | |
| // | |
| // | |
| // | |
| #include <stdlib.h> | |
| #include <stdio.h> | |
| #include <stdbool.h> | |
| // | |
| // | |
| // | |
| #include <cuda_gl_interop.h> | |
| // | |
| // | |
| // | |
| #include "assert_cuda.h" | |
| #include "interop.h" | |
| // | |
| // FPS COUNTER FROM HERE: | |
| // | |
| // http://antongerdelan.net/opengl/glcontext2.html | |
| // | |
| static | |
| void | |
| pxl_glfw_fps(GLFWwindow* window) | |
| { | |
| // static fps counters | |
| static double stamp_prev = 0.0; | |
| static int frame_count = 0; | |
| // locals | |
| const double stamp_curr = glfwGetTime(); | |
| const double elapsed = stamp_curr - stamp_prev; | |
| if (elapsed > 0.5) | |
| { | |
| stamp_prev = stamp_curr; | |
| const double fps = (double)frame_count / elapsed; | |
| int width, height; | |
| char tmp[64]; | |
| glfwGetFramebufferSize(window,&width,&height); | |
| sprintf_s(tmp,64,"(%u x %u) - FPS: %.2f",width,height,fps); | |
| glfwSetWindowTitle(window,tmp); | |
| frame_count = 0; | |
| } | |
| frame_count++; | |
| } | |
| // | |
| // | |
| // | |
| static | |
| void | |
| pxl_glfw_error_callback(int error, const char* description) | |
| { | |
| fputs(description,stderr); | |
| } | |
| static | |
| void | |
| pxl_glfw_key_callback(GLFWwindow* window, int key, int scancode, int action, int mods) | |
| { | |
| if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS) | |
| glfwSetWindowShouldClose(window, GL_TRUE); | |
| } | |
| static | |
| void | |
| pxl_glfw_init(GLFWwindow** window, const int width, const int height) | |
| { | |
| // | |
| // INITIALIZE GLFW/GLAD | |
| // | |
| glfwSetErrorCallback(pxl_glfw_error_callback); | |
| if (!glfwInit()) | |
| exit(EXIT_FAILURE); | |
| glfwWindowHint(GLFW_DEPTH_BITS, 0); | |
| glfwWindowHint(GLFW_STENCIL_BITS, 0); | |
| glfwWindowHint(GLFW_SRGB_CAPABLE, GL_TRUE); | |
| glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4); | |
| glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 5); | |
| glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); | |
| #ifdef PXL_FULLSCREEN | |
| GLFWmonitor* monitor = glfwGetPrimaryMonitor(); | |
| const GLFWvidmode* mode = glfwGetVideoMode(monitor); | |
| *window = glfwCreateWindow(mode->width,mode->height,"GLFW / CUDA Interop",monitor,NULL); | |
| #else | |
| *window = glfwCreateWindow(width,height,"GLFW / CUDA Interop",NULL,NULL); | |
| #endif | |
| if (*window == NULL) | |
| { | |
| glfwTerminate(); | |
| exit(EXIT_FAILURE); | |
| } | |
| glfwMakeContextCurrent(*window); | |
| // set up GLAD | |
| gladLoadGLLoader((GLADloadproc)glfwGetProcAddress); | |
| // ignore vsync for now | |
| glfwSwapInterval(0); | |
| // only copy r/g/b | |
| glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE); | |
| // enable SRGB | |
| // glEnable(GL_FRAMEBUFFER_SRGB); | |
| } | |
| // | |
| // | |
| // | |
| static | |
| void | |
| pxl_glfw_window_size_callback(GLFWwindow* window, int width, int height) | |
| { | |
| // get context | |
| struct pxl_interop* const interop = glfwGetWindowUserPointer(window); | |
| pxl_interop_size_set(interop,width,height); | |
| } | |
| // | |
| // | |
| // | |
| cudaError_t | |
| pxl_kernel_launcher(cudaArray_const_t array, | |
| const int width, | |
| const int height, | |
| cudaEvent_t event, | |
| cudaStream_t stream); | |
| // | |
| // | |
| // | |
| int | |
| main(int argc, char* argv[]) | |
| { | |
| // | |
| // INIT GLFW | |
| // | |
| GLFWwindow* window; | |
| pxl_glfw_init(&window,1024,1024); | |
| // | |
| // INIT CUDA | |
| // | |
| cudaError_t cuda_err; | |
| int gl_device_id,gl_device_count; | |
| cuda_err = cuda(GLGetDevices(&gl_device_count,&gl_device_id,1,cudaGLDeviceListAll)); | |
| int cuda_device_id = (argc > 1) ? atoi(argv[1]) : gl_device_id; | |
| cuda_err = cuda(SetDevice(cuda_device_id)); | |
| // | |
| // MULTI-GPU? | |
| // | |
| const bool multi_gpu = gl_device_id != cuda_device_id; | |
| // | |
| // INFO | |
| // | |
| struct cudaDeviceProp props; | |
| cuda_err = cuda(GetDeviceProperties(&props,gl_device_id)); | |
| printf("GL : %-24s (%2d)\n",props.name,props.multiProcessorCount); | |
| cuda_err = cuda(GetDeviceProperties(&props,cuda_device_id)); | |
| printf("CUDA : %-24s (%2d)\n",props.name,props.multiProcessorCount); | |
| // | |
| // CREATE CUDA STREAM & EVENT | |
| // | |
| cudaStream_t stream; | |
| cudaEvent_t event; | |
| cuda_err = cuda(StreamCreateWithFlags(&stream,cudaStreamDefault)); // optionally ignore default stream behavior | |
| cuda_err = cuda(EventCreateWithFlags(&event,cudaEventBlockingSync)); // | cudaEventDisableTiming); | |
| // | |
| // CREATE INTEROP | |
| // | |
| // TESTING -- DO NOT SET TO FALSE, ONLY TRUE IS RELIABLE | |
| struct pxl_interop* const interop = pxl_interop_create(true /*multi_gpu*/,2); | |
| // | |
| // RESIZE INTEROP | |
| // | |
| int width, height; | |
| // get initial width/height | |
| glfwGetFramebufferSize(window,&width,&height); | |
| // resize with initial window dimensions | |
| cuda_err = pxl_interop_size_set(interop,width,height); | |
| // | |
| // SET USER POINTER AND CALLBACKS | |
| // | |
| glfwSetWindowUserPointer (window,interop); | |
| glfwSetKeyCallback (window,pxl_glfw_key_callback); | |
| glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback); | |
| // | |
| // LOOP UNTIL DONE | |
| // | |
| while (!glfwWindowShouldClose(window)) | |
| { | |
| // | |
| // MONITOR FPS | |
| // | |
| pxl_glfw_fps(window); | |
| // | |
| // EXECUTE CUDA KERNEL ON RENDER BUFFER | |
| // | |
| int width,height; | |
| cudaArray_t cuda_array; | |
| pxl_interop_size_get(interop,&width,&height); | |
| cuda_err = pxl_interop_map(interop,stream); | |
| cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop), | |
| width, | |
| height, | |
| event, | |
| stream); | |
| cuda_err = pxl_interop_unmap(interop,stream); | |
| // | |
| // BLIT & SWAP FBO | |
| // | |
| pxl_interop_blit(interop); | |
| // pxl_interop_clear(interop); | |
| pxl_interop_swap(interop); | |
| // | |
| // SWAP WINDOW | |
| // | |
| glfwSwapBuffers(window); | |
| // | |
| // PUMP/POLL/WAIT | |
| // | |
| glfwPollEvents(); // glfwWaitEvents(); | |
| } | |
| // | |
| // CLEANUP | |
| // | |
| pxl_interop_destroy(interop); | |
| glfwDestroyWindow(window); | |
| glfwTerminate(); | |
| cuda(DeviceReset()); | |
| // missing some clean up here | |
| exit(EXIT_SUCCESS); | |
| } | |
| // | |
| // | |
| // |
Hi
Just went through the code. I understand CUDA but not very proficient in OpenGL. Do you have some help files to initiate a beginner like me? Thanks.
This sample code can only send buffer data from CUDA (any latest version) to OpenGL (the latest version is 4.6), right?
Yes, I think that's correct. I wrote this a very long time ago in order to understand CUDA>GL interop.
Today I would just use Vulkan.
Thanks @allanmac for your reply. The same thing I am currently trying to understand. I am using OptiX (latest version 7.4) and need to send the rendered data to opengl buffer. There is no such thing called OptiX->OGL interoperability, but CUDA to OpenGL possible. I have run your code on my machine, it's working perfectly, and now I'm trying to understand what you actually did. Can you suggest some resources about CUDA->OpenGL interoperability?
If I remember correctly, there was no guide on this subject and I was mostly using the Runtime API docs and the GL interop example in the "CUDA Samples/" directory in the SDK.
It looks like the Samples/ are now on GitHub: https://github.com/NVIDIA/cuda-samples/tree/master/Samples/5_Domain_Specific/simpleGL
IMHO, there has not been much changed on this topic. I think it is not something that many developers really need. Thank you very much for the resource.

On Windows you can build with something like this:
nvcc -Xptxas=-v -o interop -I glad\output\include -I glfw\x64\include glad\output\src\glad.c main.c interop.c assert_cuda.c kernel.cu glfw\x64\lib-vc2013\glfw3dll.libThe executable requires the glfw3.dll.