Skip to content

Instantly share code, notes, and snippets.

@mdboom
Created February 18, 2026 13:51
Show Gist options
  • Select an option

  • Save mdboom/09998002d205b682e60b6d6175e3d6f2 to your computer and use it in GitHub Desktop.

Select an option

Save mdboom/09998002d205b682e60b6d6175e3d6f2 to your computer and use it in GitHub Desktop.
# cython: language_level=3
# distutils: language = c
# distutils: libraries = cuda
"""
Minimal Cython wrapper for cuTensorMapEncodeTiled benchmark.
This isolates Cython binding overhead from cuda-python's extra functionality.
"""
from libc.stdint cimport uint32_t, uint64_t
from libc.string cimport memset
# CUDA type definitions
# Each CUDA type used by this module is declared here as a plain C integer
# typedef, outside the `cdef extern from "cuda.h"` block, so the module only
# needs the integer representation of these values. create_tensor_map casts
# its uint32_t arguments to these typedefs at the call site.
ctypedef unsigned int CUresult  # status code; 0 is treated as success in this file
ctypedef int CUtensorMapDataType
ctypedef int CUtensorMapInterleave
ctypedef int CUtensorMapSwizzle
ctypedef int CUtensorMapL2promotion
ctypedef int CUtensorMapFloatOOBfill
# Opaque descriptor storage: this module never inspects the contents, it only
# reserves 128 raw bytes and hands out a pointer to them.
cdef struct CUtensorMap:
    unsigned char data[128]
cdef extern from "cuda.h":
    # Declaration of the real encode function taken from cuda.h.
    # create_tensor_map in this file calls cuTensorMapEncodeTiledDummy, which
    # has the same parameter list, instead of this function.
    CUresult cuTensorMapEncodeTiled(
        # Output descriptor, passed as an untyped pointer.
        void* tensorMap,
        # Element type, rank and base address of the tensor.
        CUtensorMapDataType tensorDataType,
        uint32_t tensorRank,
        void* globalAddress,
        # Per-dimension arrays supplied by the caller.
        const uint64_t* globalDim,
        const uint64_t* globalStrides,
        const uint32_t* boxDim,
        const uint32_t* elementStrides,
        # Integer-valued encoding options.
        CUtensorMapInterleave interleave,
        CUtensorMapSwizzle swizzle,
        CUtensorMapL2promotion l2Promotion,
        CUtensorMapFloatOOBfill oobFill
    )
cdef CUresult cuTensorMapEncodeTiledDummy(
    void* tensorMap,
    CUtensorMapDataType tensorDataType,
    uint32_t tensorRank,
    void* globalAddress,
    const uint64_t* globalDim,
    const uint64_t* globalStrides,
    const uint32_t* boxDim,
    const uint32_t* elementStrides,
    CUtensorMapInterleave interleave,
    CUtensorMapSwizzle swizzle,
    CUtensorMapL2promotion l2Promotion,
    CUtensorMapFloatOOBfill oobFill
):
    """No-op stand-in with the same parameter list as cuTensorMapEncodeTiled.

    Every argument is ignored and nothing is written through ``tensorMap``;
    the function only reports success so the wrapper can be exercised
    without doing any real encoding work.
    """
    # Simulate success for testing: 0 is the status the caller checks for.
    return <CUresult>0
cdef class TensorMap:
    """Extension type that owns the storage for one CUtensorMap descriptor."""

    # The descriptor bytes live inline in the object (C-level attribute,
    # not visible from Python).
    cdef CUtensorMap _desc

    def __cinit__(self):
        # Clear the whole descriptor so a fresh instance is all zero bytes.
        memset(&self._desc, 0, sizeof(self._desc))
def create_tensor_map(
    uint32_t data_type,
    uint32_t rank,
    uint64_t global_address,
    list global_dim,
    list global_stride,
    list box_dim,
    list element_strides,
    uint32_t interleave,
    uint32_t swizzle,
    uint32_t l2_promotion,
    uint32_t oob_fill
):
    """Create a TMA descriptor - minimal Cython implementation.

    The four Python lists are copied into fixed-size C arrays and passed,
    together with the scalar arguments, to ``cuTensorMapEncodeTiledDummy``.

    Args:
        data_type: Value cast to ``CUtensorMapDataType``.
        rank: Tensor rank, forwarded unchanged.
        global_address: Integer address, cast to ``void*``.
        global_dim: Up to 5 integers, converted to ``uint64_t``.
        global_stride: Up to 5 integers, converted to ``uint64_t``.
        box_dim: Up to 5 integers, converted to ``uint32_t``.
        element_strides: Up to 5 integers, converted to ``uint32_t``.
        interleave: Value cast to ``CUtensorMapInterleave``.
        swizzle: Value cast to ``CUtensorMapSwizzle``.
        l2_promotion: Value cast to ``CUtensorMapL2promotion``.
        oob_fill: Value cast to ``CUtensorMapFloatOOBfill``.

    Returns:
        A new ``TensorMap`` whose descriptor was passed to the encode call.

    Raises:
        ValueError: If any of the four lists has more than 5 entries, which
            is the capacity of the local C arrays.
        RuntimeError: If the encode call returns a non-zero status.
    """
    cdef TensorMap result
    cdef uint64_t c_global_dim[5]
    cdef uint64_t c_global_stride[5]
    cdef uint32_t c_box_dim[5]
    cdef uint32_t c_element_strides[5]
    cdef Py_ssize_t i
    cdef CUresult err

    # The C arrays below live on the stack with room for exactly 5 entries;
    # reject longer lists up front instead of writing past the end of them.
    if (
        len(global_dim) > 5
        or len(global_stride) > 5
        or len(box_dim) > 5
        or len(element_strides) > 5
    ):
        raise ValueError(
            "global_dim, global_stride, box_dim and element_strides "
            "may each contain at most 5 entries"
        )

    # Zero the arrays so slots beyond a list's length hold a defined value
    # rather than whatever happened to be on the stack.
    memset(c_global_dim, 0, sizeof(c_global_dim))
    memset(c_global_stride, 0, sizeof(c_global_stride))
    memset(c_box_dim, 0, sizeof(c_box_dim))
    memset(c_element_strides, 0, sizeof(c_element_strides))

    # Convert Python lists to C arrays
    for i in range(len(global_dim)):
        c_global_dim[i] = global_dim[i]
    for i in range(len(global_stride)):
        c_global_stride[i] = global_stride[i]
    for i in range(len(box_dim)):
        c_box_dim[i] = box_dim[i]
    for i in range(len(element_strides)):
        c_element_strides[i] = element_strides[i]

    result = TensorMap()
    err = cuTensorMapEncodeTiledDummy(
        <void *>&result._desc,
        <CUtensorMapDataType>data_type,
        rank,
        <void*>global_address,
        c_global_dim,
        c_global_stride,
        c_box_dim,
        c_element_strides,
        <CUtensorMapInterleave>interleave,
        <CUtensorMapSwizzle>swizzle,
        <CUtensorMapL2promotion>l2_promotion,
        <CUtensorMapFloatOOBfill>oob_fill
    )
    # The returned status is checked as-is; it is no longer overwritten with
    # 0 before the test, so a failing encode call is actually reported.
    if err != 0:
        raise RuntimeError(f"cuTensorMapEncodeTiled failed: {err}")
    return result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment