youkaichao's gists (GitHub profile: youkaichao)
#include <iostream>
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

// Abort with a readable message whenever a CUDA runtime call fails.
#define cudaCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
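A minimal usage sketch for the macro above (the main function and allocation size are illustrative, not part of the gist):

int main() {
    int *d_buf = nullptr;
    cudaCheck(cudaMalloc(&d_buf, 1024 * sizeof(int)));  // wrap any runtime call
    cudaCheck(cudaMemset(d_buf, 0, 1024 * sizeof(int)));
    cudaCheck(cudaFree(d_buf));
    return 0;
}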
@youkaichao
youkaichao / a.py
Last active September 18, 2025 14:02
import torch
from torch.utils.cpp_extension import load_inline

src = r"""
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda_runtime.h>
// Return SM count for a specific device (or current device if device_index < 0)
int64_t num_sms(int64_t device_index = -1) {
    int device = static_cast<int>(device_index);
    if (device < 0) cudaGetDevice(&device);
    int count = 0;
    cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, device);
    return count;
}
"""
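A sketch of how this source might be compiled and bound (the extension name is an assumption; -1 selects the current device):

mod = load_inline(
    name="sm_count_ext",    # hypothetical module name
    cpp_sources=src,
    functions=["num_sms"],  # expose num_sms to Python
    with_cuda=True,
)
print(mod.num_sms(-1))      # SM count of the current device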
INFO 09-09 23:50:30 [__init__.py:216] Automatically detected platform cuda.
/usr/local/lib/python3.12/dist-packages/pytest_asyncio/plugin.py:208: PytestDeprecationWarning: The configuration option "asyncio_default_fixture_loop_scope" is unset.
The event loop scope for asynchronous fixtures will default to the fixture caching scope. Future versions of pytest-asyncio will default the loop scope for asynchronous fixtures to function scope. Set the default fixture loop scope explicitly in order to avoid unexpected behavior in the future. Valid fixture loop scopes are: "function", "class", "module", "package", "session"
warnings.warn(PytestDeprecationWarning(_DEFAULT_FIXTURE_LOOP_SCOPE_UNSET))
============================================================= test session starts =============================================================
platform linux -- Python 3.12.11, pytest-8.3.5, pluggy-1.5.0 -- /usr/bin/python3
cachedir: .pytest_cache
hypothesis profile 'default' -> database=DirectoryBasedExampleDatabase(Pos
#include <cuda.h>
#include <iostream>
#include <cassert>

// Abort on any CUDA driver API error, printing a readable message.
#define CHECK_CUDA(call) \
    do { \
        CUresult err = call; \
        if (err != CUDA_SUCCESS) { \
            const char* errStr; \
            cuGetErrorString(err, &errStr); \
            std::cerr << "CUDA driver error: " << errStr << std::endl; \
            assert(false); \
        } \
    } while (0)
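A usage sketch assuming the macro wraps driver API calls (device ordinal 0 is illustrative):

int main() {
    CHECK_CUDA(cuInit(0));
    CUdevice dev;
    CHECK_CUDA(cuDeviceGet(&dev, 0));
    int sms = 0;
    CHECK_CUDA(cuDeviceGetAttribute(&sms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev));
    std::cout << "SMs: " << sms << std::endl;
    return 0;
}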
import torch
from torch.utils.cpp_extension import load_inline

src = {
    "cuda": r"""
#include <cuda_runtime.h>
#include <torch/all.h>
#include <c10/cuda/CUDAStream.h>
// kernel body below is an assumed busy-wait sketch
__global__ void computation_kernel(unsigned long long total_nanosec) {
    for (unsigned long long t = 0; t < total_nanosec; t += 1000000ULL)
        __nanosleep(1000000U);  // sleep ~1 ms per iteration
}
""",
}
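The dict presumably feeds load_inline; a hedged sketch of compiling and launching it (the run_computation wrapper and its declaration are assumptions, not the gist's code):

src["cuda"] += r"""
void run_computation(int64_t total_nanosec) {
    auto stream = c10::cuda::getCurrentCUDAStream();
    computation_kernel<<<1, 1, 0, stream.stream()>>>((unsigned long long)total_nanosec);
}
"""
mod = load_inline(
    name="busy_wait_ext",  # hypothetical module name
    cpp_sources="void run_computation(int64_t total_nanosec);",
    cuda_sources=src["cuda"],
    functions=["run_computation"],
    with_cuda=True,
)
mod.run_computation(1_000_000_000)  # keep the GPU busy for roughly one second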
@youkaichao
youkaichao / test1.cu
Created April 24, 2025 11:05
test1.cu
#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>

// Kernel that deliberately performs an illegal memory access
__global__ void illegalWildPointerKernel(int* data, int size) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    __nanosleep(1000000000ULL); // Sleep for 1 second
    int* wild_pointer = (int*)0x100;
    if (idx == 0) {
        *wild_pointer = 42;  // write through a wild pointer: illegal access
    }
}
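A host-side sketch that launches the kernel and surfaces the fault (grid shape and the synchronize-then-check pattern are illustrative):

int main() {
    int* d_data = nullptr;
    cudaMalloc(&d_data, 32 * sizeof(int));
    illegalWildPointerKernel<<<1, 32>>>(d_data, 32);
    // the wild write is reported asynchronously, at the next synchronization
    cudaError_t err = cudaDeviceSynchronize();
    std::cout << "sync result: " << cudaGetErrorString(err) << std::endl;
    return 0;
}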
@youkaichao
youkaichao / test.py
Created February 6, 2025 04:34
gloo vs. nccl
import torch
import torch.distributed as dist

# Toggle between the NCCL and Gloo backends for the comparison.
use_nccl = False
dist.init_process_group(backend="nccl" if use_nccl else "gloo")
rank = dist.get_rank()
torch.cuda.set_device(rank % 8)  # map each rank onto one of 8 GPUs per node
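The preview stops here; a sketch of the kind of timing loop the title suggests (tensor size and iteration counts are illustrative):

import time
x = torch.ones(1024 * 1024, device="cuda" if use_nccl else "cpu")
for _ in range(10):  # warmup
    dist.all_reduce(x)
if use_nccl:
    torch.cuda.synchronize()
start = time.time()
for _ in range(100):
    dist.all_reduce(x)
if use_nccl:
    torch.cuda.synchronize()
if rank == 0:
    print(f"avg all_reduce: {(time.time() - start) / 100 * 1e3:.3f} ms")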
@youkaichao
youkaichao / test_pytorch.py
Created January 3, 2025 03:05
compare shared-memory broadcast and PyTorch broadcast_object_list
import torch.distributed as dist
import torch
import time

dist.init_process_group(backend="nccl")
rank = dist.get_rank()
torch.cuda.set_device(rank)
N_warmup = 10  # iterations to discard before timing
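The preview ends at the warmup constant; a hedged sketch of the broadcast_object_list side of the comparison (payload and iteration count are illustrative):

N_iters = 100
obj = {"payload": list(range(256))}  # small Python object, illustrative
for i in range(N_warmup + N_iters):
    if i == N_warmup:
        start = time.time()
    container = [obj if rank == 0 else None]
    dist.broadcast_object_list(container, src=0)
if rank == 0:
    print(f"broadcast_object_list: {(time.time() - start) / N_iters * 1e3:.3f} ms")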
@youkaichao
youkaichao / embedding.py
Created November 6, 2024 20:28
inplace embedding
import torch
import torch.nn as nn
# an Embedding module containing 10 tensors of size 3
embedding = nn.Embedding(10, 3)
embedding.weight.requires_grad_(False)
# a batch of 4 indices
input = torch.LongTensor([1, 2, 4, 5])
output = embedding(input)
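The title says "inplace embedding"; one way to make the lookup allocation-free is torch.index_select with a preallocated out= buffer (the buffer name is illustrative):

out_buf = torch.empty(4, 3)  # allocated once, reused for every lookup
torch.index_select(embedding.weight, 0, input, out=out_buf)
assert torch.equal(out_buf, output)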
@youkaichao
youkaichao / ipc.py
Created November 5, 2024 00:06
cuda ipc
import os
from typing import List

# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
import torch.distributed as dist

# gloo handles coordination on CPU; the CUDA data is shared via IPC (per the title)
dist.init_process_group(backend="gloo")
rank = local_rank = dist.get_rank()
world_size = dist.get_world_size()
torch.cuda.set_device(local_rank)
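The preview stops after setup; a hedged sketch of sharing a CUDA tensor across processes via IPC handles (using reduce_tensor from torch.multiprocessing.reductions; the shape and the broadcast route are assumptions):

from torch.multiprocessing.reductions import reduce_tensor
if rank == 0:
    t = torch.arange(8, dtype=torch.float32, device="cuda")
    handle = reduce_tensor(t)  # (rebuild_fn, args), args carry the IPC handle
else:
    handle = None
container = [handle]
dist.broadcast_object_list(container, src=0)  # gloo ships the pickled handle
if rank != 0:
    rebuild_fn, args = container[0]
    shared = rebuild_fn(*args)  # maps rank 0's allocation into this process
    print(rank, shared)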