youkaichao's gists (GitHub profile: youkaichao)
#include <iostream>
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

// Abort with a readable message whenever a CUDA runtime call fails.
#define cudaCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
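A minimal usage sketch for the macro above (the main function and allocation size are illustrative, not part of the gist):

int main() {
    int *d_buf = nullptr;
    cudaCheck(cudaMalloc(&d_buf, 1024 * sizeof(int)));  // wrap any runtime call
    cudaCheck(cudaMemset(d_buf, 0, 1024 * sizeof(int)));
    cudaCheck(cudaFree(d_buf));
    return 0;
}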
@youkaichao
youkaichao / a.py
Last active September 18, 2025 14:02
import torch
from torch.utils.cpp_extension import load_inline

src = r"""
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda_runtime.h>
// Return SM count for a specific device (or current device if device_index < 0)
int64_t num_sms(int64_t device_index = -1) {
    int device = static_cast<int>(device_index);
    if (device < 0) cudaGetDevice(&device);
    int count = 0;
    cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, device);
    return count;
}
"""
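A sketch of how this source might be compiled and bound (the extension name is an assumption; -1 selects the current device):

mod = load_inline(
    name="sm_count_ext",    # hypothetical module name
    cpp_sources=src,
    functions=["num_sms"],  # expose num_sms to Python
    with_cuda=True,
)
print(mod.num_sms(-1))      # SM count of the current device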
INFO 09-09 23:50:30 [__init__.py:216] Automatically detected platform cuda.
/usr/local/lib/python3.12/dist-packages/pytest_asyncio/plugin.py:208: PytestDeprecationWarning: The configuration option "asyncio_default_fixture_loop_scope" is unset.
The event loop scope for asynchronous fixtures will default to the fixture caching scope. Future versions of pytest-asyncio will default the loop scope for asynchronous fixtures to function scope. Set the default fixture loop scope explicitly in order to avoid unexpected behavior in the future. Valid fixture loop scopes are: "function", "class", "module", "package", "session"
warnings.warn(PytestDeprecationWarning(_DEFAULT_FIXTURE_LOOP_SCOPE_UNSET))
============================================================= test session starts =============================================================
platform linux -- Python 3.12.11, pytest-8.3.5, pluggy-1.5.0 -- /usr/bin/python3
cachedir: .pytest_cache
hypothesis profile 'default' -> database=DirectoryBasedExampleDatabase(Pos
#include <cuda.h>
#include <iostream>
#include <cassert>

// Abort on any CUDA driver API error, printing a readable message.
#define CHECK_CUDA(call) \
    do { \
        CUresult err = call; \
        if (err != CUDA_SUCCESS) { \
            const char* errStr; \
            cuGetErrorString(err, &errStr); \
            std::cerr << "CUDA driver error: " << errStr << std::endl; \
            assert(false); \
        } \
    } while (0)
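A usage sketch assuming the macro wraps driver API calls (device ordinal 0 is illustrative):

int main() {
    CHECK_CUDA(cuInit(0));
    CUdevice dev;
    CHECK_CUDA(cuDeviceGet(&dev, 0));
    int sms = 0;
    CHECK_CUDA(cuDeviceGetAttribute(&sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev));
    std::cout << "SMs: " << sms << std::endl;
    return 0;
}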
import torch
from torch.utils.cpp_extension import load_inline

src = {
    "cuda": r"""
#include <cuda_runtime.h>
#include <torch/all.h>
#include <c10/cuda/CUDAStream.h>
// kernel body below is an assumed busy-wait sketch
__global__ void computation_kernel(unsigned long long total_nanosec) {
    for (unsigned long long t = 0; t < total_nanosec; t += 1000000ULL)
        __nanosleep(1000000U);  // sleep ~1 ms per iteration
}
""",
}
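The dict presumably feeds load_inline; a hedged sketch of compiling and launching it (the run_computation wrapper and its declaration are assumptions, not the gist's code):

src["cuda"] += r"""
void run_computation(int64_t total_nanosec) {
    auto stream = c10::cuda::getCurrentCUDAStream();
    computation_kernel<<<1, 1, 0, stream.stream()>>>((unsigned long long)total_nanosec);
}
"""
mod = load_inline(
    name="busy_wait_ext",  # hypothetical module name
    cpp_sources="void run_computation(int64_t total_nanosec);",
    cuda_sources=src["cuda"],
    functions=["run_computation"],
    with_cuda=True,
)
mod.run_computation(1_000_000_000)  # keep the GPU busy for roughly one second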
@youkaichao
youkaichao / test1.cu
Created April 24, 2025 11:05
test1.cu
#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>

// Kernel that deliberately performs an illegal memory access
__global__ void illegalWildPointerKernel(int* data, int size) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    __nanosleep(1000000000ULL); // Sleep for 1 second
    int* wild_pointer = (int*)0x100;
    if (idx == 0) {
        *wild_pointer = 42;  // write through a wild pointer: illegal access
    }
}
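A host-side sketch that launches the kernel and surfaces the fault (grid shape and the synchronize-then-check pattern are illustrative):

int main() {
    int* d_data = nullptr;
    cudaMalloc(&d_data, 32 * sizeof(int));
    illegalWildPointerKernel<<<1, 32>>>(d_data, 32);
    // the wild write is reported asynchronously, at the next synchronization
    cudaError_t err = cudaDeviceSynchronize();
    std::cout << "sync result: " << cudaGetErrorString(err) << std::endl;
    return 0;
}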
@youkaichao
youkaichao / test.py
Created February 6, 2025 04:34
gloo vs. nccl
import torch
import torch.distributed as dist

# Toggle between the NCCL and Gloo backends for the comparison.
use_nccl = False
dist.init_process_group(backend="nccl" if use_nccl else "gloo")
rank = dist.get_rank()
torch.cuda.set_device(rank % 8)  # map each rank onto one of 8 GPUs per node
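The preview stops here; a sketch of the kind of timing loop the title suggests (tensor size and iteration counts are illustrative):

import time
x = torch.ones(1024 * 1024, device="cuda" if use_nccl else "cpu")
for _ in range(10):  # warmup
    dist.all_reduce(x)
if use_nccl:
    torch.cuda.synchronize()
start = time.time()
for _ in range(100):
    dist.all_reduce(x)
if use_nccl:
    torch.cuda.synchronize()
if rank == 0:
    print(f"avg all_reduce: {(time.time() - start) / 100 * 1e3:.3f} ms")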
@youkaichao
youkaichao / test_pytorch.py
Created January 3, 2025 03:05
compare shared-memory broadcast and PyTorch broadcast_object_list
import torch.distributed as dist
import torch
import time

dist.init_process_group(backend="nccl")
rank = dist.get_rank()
torch.cuda.set_device(rank)
N_warmup = 10  # iterations to discard before timing
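The preview ends at the warmup constant; a hedged sketch of the broadcast_object_list side of the comparison (payload and iteration count are illustrative):

N_iters = 100
obj = {"payload": list(range(256))}  # small Python object, illustrative
for i in range(N_warmup + N_iters):
    if i == N_warmup:
        start = time.time()
    container = [obj if rank == 0 else None]
    dist.broadcast_object_list(container, src=0)
if rank == 0:
    print(f"broadcast_object_list: {(time.time() - start) / N_iters * 1e3:.3f} ms")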
@youkaichao
youkaichao / embedding.py
Created November 6, 2024 20:28
inplace embedding
import torch
import torch.nn as nn
# an Embedding module containing 10 tensors of size 3
embedding = nn.Embedding(10, 3)
embedding.weight.requires_grad_(False)
# a batch of 4 indices
input = torch.LongTensor([1, 2, 4, 5])
output = embedding(input)
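The title says "inplace embedding"; one way to make the lookup allocation-free is torch.index_select with a preallocated out= buffer (the buffer name is illustrative):

out_buf = torch.empty(4, 3)  # allocated once, reused for every lookup
torch.index_select(embedding.weight, 0, input, out=out_buf)
assert torch.equal(out_buf, output)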
@youkaichao
youkaichao / ipc.py
Created November 5, 2024 00:06
cuda ipc
import os
from typing import List

# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
import torch.distributed as dist

# gloo handles coordination on CPU; the CUDA data is shared via IPC (per the title)
dist.init_process_group(backend="gloo")
rank = local_rank = dist.get_rank()
world_size = dist.get_world_size()
torch.cuda.set_device(local_rank)
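The preview stops after setup; a hedged sketch of sharing a CUDA tensor across processes via IPC handles (using reduce_tensor from torch.multiprocessing.reductions; the shape and the broadcast route are assumptions):

from torch.multiprocessing.reductions import reduce_tensor
if rank == 0:
    t = torch.arange(8, dtype=torch.float32, device="cuda")
    handle = reduce_tensor(t)  # (rebuild_fn, args), args carry the IPC handle
else:
    handle = None
container = [handle]
dist.broadcast_object_list(container, src=0)  # gloo ships the pickled handle
if rank != 0:
    rebuild_fn, args = container[0]
    shared = rebuild_fn(*args)  # maps rank 0's allocation into this process
    print(rank, shared)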