Created
February 18, 2026 13:51
-
-
Save mdboom/09998002d205b682e60b6d6175e3d6f2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # cython: language_level=3 | |
| # distutils: language = c | |
| # distutils: libraries = cuda | |
| """ | |
| Minimal Cython wrapper for cuTensorMapEncodeTiled benchmark. | |
| This isolates Cython binding overhead from cuda-python's extra functionality. | |
| """ | |
| from libc.stdint cimport uint32_t, uint64_t | |
| from libc.string cimport memset | |
# Local stand-ins for the CUDA driver's scalar types.
# The status code is modelled as an unsigned integer; each of the tensor-map
# option types is modelled as a plain C int, which is why callers can cast
# ordinary integers to them.
ctypedef unsigned int CUresult
ctypedef int CUtensorMapDataType
ctypedef int CUtensorMapInterleave
ctypedef int CUtensorMapSwizzle
ctypedef int CUtensorMapL2promotion
ctypedef int CUtensorMapFloatOOBfill
# Opaque tensor-map descriptor: a fixed 128-byte blob that this module never
# inspects, only zeroes and hands to the encoder by address.
# NOTE(review): cuda.h declares CUtensorMap with 64-byte alignment; a plain
# byte array does not carry that requirement -- confirm whether it matters
# for the way these descriptors are consumed.
cdef struct CUtensorMap:
    unsigned char data[128]
# Prototype of the CUDA driver's tiled tensor-map encoder, pulled in from
# cuda.h. The descriptor is passed as an untyped pointer so the locally
# declared 128-byte CUtensorMap struct can be handed over by address.
cdef extern from "cuda.h":
    CUresult cuTensorMapEncodeTiled(
        void* tensorMap,                      # descriptor to fill in
        CUtensorMapDataType tensorDataType,
        uint32_t tensorRank,
        void* globalAddress,
        const uint64_t* globalDim,            # per-dimension arrays follow
        const uint64_t* globalStrides,
        const uint32_t* boxDim,
        const uint32_t* elementStrides,
        CUtensorMapInterleave interleave,     # encoding options follow
        CUtensorMapSwizzle swizzle,
        CUtensorMapL2promotion l2Promotion,
        CUtensorMapFloatOOBfill oobFill
    )
# No-op stand-in that takes the same parameter list as cuTensorMapEncodeTiled.
# Every argument is ignored and the descriptor is left untouched; the function
# only reports success, so callers can be exercised without the real encoder.
cdef CUresult cuTensorMapEncodeTiledDummy(
    void* tensorMap,
    CUtensorMapDataType tensorDataType,
    uint32_t tensorRank,
    void* globalAddress,
    const uint64_t* globalDim,
    const uint64_t* globalStrides,
    const uint32_t* boxDim,
    const uint32_t* elementStrides,
    CUtensorMapInterleave interleave,
    CUtensorMapSwizzle swizzle,
    CUtensorMapL2promotion l2Promotion,
    CUtensorMapFloatOOBfill oobFill
):
    # Zero is the value the caller treats as "no error".
    cdef CUresult status = 0
    return status
cdef class TensorMap:
    """Extension type that owns a single CUtensorMap descriptor by value."""

    # Descriptor storage lives inline in the object; only C-level code in this
    # module can reach it.
    cdef CUtensorMap _desc

    def __cinit__(self):
        # Begin with an all-zero descriptor so its contents are well defined
        # before any encoder has written to it.
        memset(&self._desc, 0, sizeof(self._desc))
def create_tensor_map(
    uint32_t data_type,
    uint32_t rank,
    uint64_t global_address,
    list global_dim,
    list global_stride,
    list box_dim,
    list element_strides,
    uint32_t interleave,
    uint32_t swizzle,
    uint32_t l2_promotion,
    uint32_t oob_fill
):
    """Create a TMA descriptor - minimal Cython implementation.

    The four Python lists are copied into fixed-size C arrays and passed,
    together with the scalar arguments, to ``cuTensorMapEncodeTiledDummy``
    (the no-op stand-in used by this benchmark module).

    Args:
        data_type: Integer cast to ``CUtensorMapDataType``.
        rank: Tensor rank, forwarded unchanged.
        global_address: Integer address, forwarded as a ``void*``.
        global_dim: Up to 5 integers, copied into a ``uint64_t`` array.
        global_stride: Up to 5 integers, copied into a ``uint64_t`` array.
        box_dim: Up to 5 integers, copied into a ``uint32_t`` array.
        element_strides: Up to 5 integers, copied into a ``uint32_t`` array.
        interleave: Integer cast to ``CUtensorMapInterleave``.
        swizzle: Integer cast to ``CUtensorMapSwizzle``.
        l2_promotion: Integer cast to ``CUtensorMapL2promotion``.
        oob_fill: Integer cast to ``CUtensorMapFloatOOBfill``.

    Returns:
        A new ``TensorMap`` whose descriptor was handed to the encoder.

    Raises:
        ValueError: If any of the four lists has more than 5 entries.
        RuntimeError: If the encoder returns a non-zero status code.
    """
    cdef TensorMap result = TensorMap()

    # Scratch arrays handed to the encoder; 5 is their fixed capacity.
    cdef uint64_t c_global_dim[5]
    cdef uint64_t c_global_stride[5]
    cdef uint32_t c_box_dim[5]
    cdef uint32_t c_element_strides[5]

    cdef Py_ssize_t n_global_dim = len(global_dim)
    cdef Py_ssize_t n_global_stride = len(global_stride)
    cdef Py_ssize_t n_box_dim = len(box_dim)
    cdef Py_ssize_t n_element_strides = len(element_strides)
    cdef Py_ssize_t i
    cdef CUresult err

    # Reject oversized inputs before copying: writing past the end of the
    # stack arrays above would silently corrupt memory.
    if (
        n_global_dim > 5
        or n_global_stride > 5
        or n_box_dim > 5
        or n_element_strides > 5
    ):
        raise ValueError(
            "global_dim, global_stride, box_dim and element_strides "
            "must each have at most 5 entries"
        )

    # Zero-fill so that slots not covered by a short list hold a defined
    # value instead of whatever happened to be on the stack.
    memset(c_global_dim, 0, sizeof(c_global_dim))
    memset(c_global_stride, 0, sizeof(c_global_stride))
    memset(c_box_dim, 0, sizeof(c_box_dim))
    memset(c_element_strides, 0, sizeof(c_element_strides))

    # Convert Python lists to C arrays
    for i in range(n_global_dim):
        c_global_dim[i] = global_dim[i]
    for i in range(n_global_stride):
        c_global_stride[i] = global_stride[i]
    for i in range(n_box_dim):
        c_box_dim[i] = box_dim[i]
    for i in range(n_element_strides):
        c_element_strides[i] = element_strides[i]

    err = cuTensorMapEncodeTiledDummy(
        <void *>&result._desc,
        <CUtensorMapDataType>data_type,
        rank,
        <void*>global_address,
        c_global_dim,
        c_global_stride,
        c_box_dim,
        c_element_strides,
        <CUtensorMapInterleave>interleave,
        <CUtensorMapSwizzle>swizzle,
        <CUtensorMapL2promotion>l2_promotion,
        <CUtensorMapFloatOOBfill>oob_fill
    )
    # The status code is checked as returned; it must not be overwritten
    # before this test or encoder failures would go unnoticed.
    if err != 0:
        raise RuntimeError(f"cuTensorMapEncodeTiled failed: {err}")
    return result
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment