Created
June 25, 2025 07:22
-
-
Save sin3point14/4b11f7babd6d90bda18aa5e8fd0072a6 to your computer and use it in GitHub Desktop.
Eigen cuda inverse test
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <Eigen/Dense> | |
| #include <iostream> | |
| using mat = Eigen::Matrix<float, 5, 5>; | |
// Debug helper: prints an Eigen matrix from device code, one bracketed row
// per line in scientific notation. Device printf is serialized and slow --
// for debugging only.
template <typename MatrixType>
__device__ void printCudaMatrix(const MatrixType& matrix) {
    const int nRows = matrix.rows();
    const int nCols = matrix.cols();
    for (int r = 0; r < nRows; ++r) {
        printf(" Row %3d: [", r);
        for (int c = 0; c < nCols; ++c) {
            // Explicit cast to double: varargs would promote float anyway,
            // and %e expects a double.
            printf("%15.8e ", (double)matrix(r, c));
        }
        printf("]\n");
    }
}
// Kernel: replaces *A with its inverse in place, printing the matrix before
// and after. Meant to be launched <<<1, 1>>>: one thread does everything --
// this is a correctness probe, not a performance kernel.
// NOTE(review): Eigen only has analytic device-side inverse paths for sizes
// up to 4x4; for 5x5 it falls back to host-only PartialPivLU, which nvcc
// may silently compile to an empty kernel (see discussion below the gist).
__global__ void matrixInv(mat* A) {
    mat& M = *A;
    printf("kernel A:\n");
    printCudaMatrix(M);
    const mat inv = M.inverse();
    printf("kernel B:\n");
    printCudaMatrix(inv);
    M = inv;
    printf("kernel Ainv:\n");
    printCudaMatrix(M);
}
// Host driver: inverts a random 5x5 matrix on both CPU and GPU and prints
// the two results for comparison.
//
// Fix over the original: every CUDA API call's return code is now checked,
// and cudaGetLastError()/cudaDeviceSynchronize() are inspected after the
// launch. The original ignored all of them, so a launch failure or an
// empty/no-op kernel would pass silently and simply echo back the input
// matrix as the "GPU inverse".
int main() {
    srand(0);  // fixed seed so mat::Random() is reproducible across runs

    mat h_A = mat::Random();
    std::cout << "CPU:\n" << h_A << std::endl;

    mat* d_A = nullptr;
    cudaError_t err = cudaMalloc((void**)&d_A, sizeof(mat));
    if (err != cudaSuccess) {
        std::cerr << "cudaMalloc: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    err = cudaMemcpy(d_A, &h_A, sizeof(mat), cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        std::cerr << "cudaMemcpy H2D: " << cudaGetErrorString(err) << std::endl;
        cudaFree(d_A);
        return 1;
    }

    matrixInv<<<1, 1>>>(d_A);
    // Launch-configuration errors surface via cudaGetLastError(), not from
    // the launch expression itself.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "kernel launch: " << cudaGetErrorString(err) << std::endl;
        cudaFree(d_A);
        return 1;
    }
    // Asynchronous execution errors (illegal address, device trap, ...)
    // surface at the next synchronizing call.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cerr << "kernel execution: " << cudaGetErrorString(err) << std::endl;
        cudaFree(d_A);
        return 1;
    }

    mat h_Ainv;
    err = cudaMemcpy(&h_Ainv, d_A, sizeof(mat), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        std::cerr << "cudaMemcpy D2H: " << cudaGetErrorString(err) << std::endl;
        cudaFree(d_A);
        return 1;
    }
    cudaFree(d_A);

    std::cout << "CPU final inv:\n" << h_A.inverse() << std::endl;
    std::cout << "GPU final inv:\n" << h_Ainv << std::endl;
    return 0;
}
Author
Author
I tried seeing what happens to the generated ptx for various matrix inverses at https://godbolt.org/z/hP6zeEh5E for 1x1...4x4 matrices. I am using the NVCC 12.5.1 compiler with -arch=sm_86 -O2 options. The matrixInv in the PTX view seems to contain some asm which grows in size with increasing dimensions, hence I assume that it is correct. However when I shift to 5x5 the function simply returns:
.visible .entry matrixInv(Eigen::Matrix<float, 5, 5, 0, 5, 5>*)(
.param .u64 matrixInv(Eigen::Matrix<float, 5, 5, 0, 5, 5>*)_param_0
)
{
ret;
}
Author
Compiling with clang++, however, throws an error as expected. I used the trunk sm_100a CUDA-12.8.1 compiler with -O2. Somehow adding --cuda-gpu-arch=sm_86 throws an error, so I just skipped it.
In file included from <source>:1:
In file included from /opt/compiler-explorer/libs/eigen/v3.4.0/Eigen/Dense:2:
In file included from /opt/compiler-explorer/libs/eigen/v3.4.0/Eigen/LU:39:
/opt/compiler-explorer/libs/eigen/v3.4.0/Eigen/src/LU/InverseImpl.h:28:21: error: reference to __host__ function 'partialPivLu' in __host__ __device__ function
28 | result = matrix.partialPivLu().inverse();
| ^
...
As expected clang++ refuses to compile iterative functions on the GPU
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Using nvcc:
Eigen's inverse gives back the same (unmodified) matrix, and there are no compilation or runtime errors.