// Error-checking macro for CUDA runtime calls: report the failing call's
// error string, source file, and line, then abort.
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>
#include <cuda_runtime.h>

#define cudaCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
# Build a tiny inline C++ extension that reports the number of streaming
# multiprocessors (SMs) on a device.
import torch
from torch.utils.cpp_extension import load_inline

src = r"""
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda_runtime.h>

// Return SM count for a specific device (or current device if device_index < 0)
int64_t num_sms(int64_t device_index = -1) {
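The extension source is cut off above. Independent of how the original completes it, the same number is exposed directly by PyTorch; a minimal sketch (the device index 0 is an assumption):

import torch

# SM count for device 0 straight from the device properties -- presumably the
# same value the inline num_sms() helper above returns.
props = torch.cuda.get_device_properties(0)
print("SMs:", props.multi_processor_count)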
INFO 09-09 23:50:30 [__init__.py:216] Automatically detected platform cuda.
/usr/local/lib/python3.12/dist-packages/pytest_asyncio/plugin.py:208: PytestDeprecationWarning: The configuration option "asyncio_default_fixture_loop_scope" is unset.
The event loop scope for asynchronous fixtures will default to the fixture caching scope. Future versions of pytest-asyncio will default the loop scope for asynchronous fixtures to function scope. Set the default fixture loop scope explicitly in order to avoid unexpected behavior in the future. Valid fixture loop scopes are: "function", "class", "module", "package", "session"
  warnings.warn(PytestDeprecationWarning(_DEFAULT_FIXTURE_LOOP_SCOPE_UNSET))
============================================================= test session starts =============================================================
platform linux -- Python 3.12.11, pytest-8.3.5, pluggy-1.5.0 -- /usr/bin/python3
cachedir: .pytest_cache
hypothesis profile 'default' -> database=DirectoryBasedExampleDatabase(Pos
// Error-checking macro for CUDA driver API (cu*) calls: look up the error
// string, report it, and abort.
#include <cuda.h>
#include <iostream>
#include <cassert>

#define CHECK_CUDA(call)                                              \
    do {                                                              \
        CUresult err = call;                                          \
        if (err != CUDA_SUCCESS) {                                    \
            const char* errStr;                                       \
            cuGetErrorString(err, &errStr);                           \
            std::cerr << "CUDA driver error: " << errStr << std::endl; \
            assert(false);                                            \
        }                                                             \
    } while (0)
# Inline CUDA extension defining computation_kernel(total_nanosec), which
# presumably keeps the GPU busy for the requested number of nanoseconds
# (the kernel body is truncated in this listing).
import torch
from torch.utils.cpp_extension import load_inline

src = {
    "cuda": r"""
#include <cuda_runtime.h>
#include <torch/all.h>
#include <c10/cuda/CUDAStream.h>

__global__ void computation_kernel(unsigned long long total_nanosec) {
// Kernel that deliberately performs an illegal memory access after a delay,
// useful for testing how such errors surface.
#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>

// Define the kernel with illegal memory access
__global__ void illegalWildPointerKernel(int* data, int size) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    __nanosleep(1000000000ULL); // Request a ~1 s sleep; a single __nanosleep is capped at roughly 1 ms, so the actual delay is much shorter
    int* wild_pointer = (int*)0x100;
    if (idx == 0) {
        // Reading from an unmapped address triggers an illegal memory access.
        data[idx] = *wild_pointer;
    }
}
# Initialize torch.distributed with either NCCL or Gloo and pin each rank to a
# GPU (assuming at most 8 GPUs per node).
import torch
import torch.distributed as dist

use_nccl = False
dist.init_process_group(backend="nccl" if use_nccl else "gloo")
rank = dist.get_rank()
torch.cuda.set_device(rank % 8)
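The snippet stops after pinning the device. As a usage illustration (not part of the original), a small all-reduce works with either backend chosen above when the script is launched under torchrun, e.g. torchrun --nproc_per_node=2 script.py:

# Illustrative continuation of the snippet above: a quick all-reduce sanity check.
# Gloo reduces CPU tensors; NCCL requires CUDA tensors.
device = torch.device("cuda", rank % 8) if use_nccl else torch.device("cpu")
x = torch.ones(4, device=device) * rank
dist.all_reduce(x)  # defaults to a sum across ranks
print(f"rank {rank}: {x.tolist()}")
dist.destroy_process_group()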
# Per-GPU NCCL setup for a collective benchmark, with N_warmup warm-up
# iterations before the timed runs.
import torch.distributed as dist
import torch
import time

dist.init_process_group(backend="nccl")
rank = dist.get_rank()
torch.cuda.set_device(rank)
N_warmup = 10
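The listing cuts off after the warm-up count. Below is a minimal sketch of the usual warm-up-then-measure pattern; the collective, tensor size, and iteration count are assumptions rather than the original's choices:

# Illustrative continuation: warm up, then time all_reduce over a fixed tensor.
N_iters = 100                             # assumed number of timed iterations
x = torch.ones(1 << 20, device="cuda")    # assumed 4 MiB float32 payload

for _ in range(N_warmup):                 # untimed: sets up communicators and caches
    dist.all_reduce(x)
torch.cuda.synchronize()

start = time.time()
for _ in range(N_iters):
    dist.all_reduce(x)
torch.cuda.synchronize()                  # NCCL kernels run asynchronously; wait before stopping the clock
elapsed = time.time() - start

if rank == 0:
    print(f"avg all_reduce time: {elapsed / N_iters * 1e3:.3f} ms")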
import torch
import torch.nn as nn

# an Embedding module containing 10 tensors of size 3
embedding = nn.Embedding(10, 3)
embedding.weight.requires_grad_(False)
# a batch of 4 indices
input = torch.LongTensor([1, 2, 4, 5])
output = embedding(input)
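With the weight frozen, the lookup is a plain gather and the result carries no autograd history:

print(output.shape)          # torch.Size([4, 3]) -- one 3-dim vector per index
print(output.requires_grad)  # False, since the embedding weight was frozen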
# Gloo-based multi-process setup; the commented-out PYTORCH_CUDA_ALLOC_CONF line
# would enable expandable segments in the CUDA caching allocator (it sits before
# `import torch` so it is in place before the allocator is configured).
import os
from typing import List

# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
import torch.distributed as dist

dist.init_process_group(backend="gloo")
rank = local_rank = dist.get_rank()
world_size = dist.get_world_size()
torch.cuda.set_device(local_rank)
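The listing ends here. As an illustration (the buffer size below is an assumption, not from the original), the allocator counters make the effect of the allocator configuration visible:

# Illustrative continuation: allocate a per-rank buffer and report what the
# caching allocator has actually reserved on this device.
buf = torch.empty(256 * 1024 * 1024, dtype=torch.uint8, device="cuda")  # assumed 256 MiB buffer
torch.cuda.synchronize()
print(f"rank {rank}/{world_size}: "
      f"allocated={torch.cuda.memory_allocated() / 2**20:.0f} MiB, "
      f"reserved={torch.cuda.memory_reserved() / 2**20:.0f} MiB")
dist.destroy_process_group()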