```python
import os
from typing import List
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
import torch.distributed as dist

dist.init_process_group(backend="gloo")
rank = local_rank = dist.get_rank()
world_size = dist.get_world_size()
torch.cuda.set_device(local_rank)

def share_tensor(A: torch.Tensor, group=None) -> List[torch.Tensor]:
    from torch.multiprocessing.reductions import reduce_tensor
    A_meta = reduce_tensor(A)
    tensor_metas = [None] * world_size
    dist.all_gather_object(tensor_metas, A_meta, group=group)
    rank = dist.get_rank(group)
    all_tensors = []
    for i, obj in enumerate(tensor_metas):
        func = obj[0]
        args = list(obj[1])
        args[6] = A.device.index
        if i != rank:
            all_tensors.append(func(*args))
        else:
            all_tensors.append(A)
    return all_tensors

A = torch.ones((10,), device=local_rank) * rank
all_tensors = share_tensor(A)
dist.barrier()
torch.cuda.synchronize()

if rank == 0:
    for x in all_tensors:
        x.zero_()

dist.barrier()
torch.cuda.synchronize()

for i, x in enumerate(all_tensors):
    print(f"{rank=}, {i=}, {x=}")
```
IPC of expandable segments was introduced in PyTorch 2.5. However, I find that some Linux systems do not allow this feature.
Example test code to manually test the IPC functionality:
```cpp
// sender.cpp
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <cstring>
#include <unistd.h>
#include <sys/syscall.h>
// Define syscall numbers if not available
#ifndef SYS_pidfd_open
#define SYS_pidfd_open 434
#endif
struct ShareHeader {
pid_t pid;
size_t segment_size;
size_t num_handles;
};
// Helper function to get CUDA error string
const char* getCudaErrorString(CUresult error) {
const char* errorString;
cuGetErrorString(error, &errorString);
return errorString;
}
int main() {
// Initialize CUDA
CUresult result = cuInit(0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to initialize CUDA: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Get CUDA device
CUdevice device;
result = cuDeviceGet(&device, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to get CUDA device: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Create CUDA context
CUcontext context;
result = cuCtxCreate(&context, 0, device);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to create CUDA context: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Allocate memory using VMM API
const size_t size = 20 * 1024 * 1024; // 20MB
CUmemGenericAllocationHandle handle;
// Set up memory allocation properties
CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.location.id = 0; // Use device 0
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; // Specify handle type for export
prop.win32HandleMetaData = nullptr;
// Get the minimum granularity supported for allocation
size_t granularity = 0;
result = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to get allocation granularity: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Ensure size is a multiple of granularity
if (size % granularity) {
std::cerr << "Allocation size is not a multiple of minimum supported granularity" << std::endl;
return 1;
}
std::cout << "Creating memory handle with size: " << size << " bytes" << std::endl;
result = cuMemCreate(&handle, size, &prop, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to create memory handle: " << getCudaErrorString(result) << std::endl;
return 1;
}
std::cout << "Successfully created memory handle" << std::endl;
// Reserve address range
CUdeviceptr ptr;
std::cout << "Reserving address range" << std::endl;
result = cuMemAddressReserve(&ptr, size, 0, 0, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to reserve address range: " << getCudaErrorString(result) << std::endl;
cuMemRelease(handle);
return 1;
}
std::cout << "Successfully reserved address range at: " << ptr << std::endl;
// Map the memory
std::cout << "Mapping memory" << std::endl;
result = cuMemMap(ptr, size, 0, handle, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to map memory: " << getCudaErrorString(result) << std::endl;
cuMemAddressFree(ptr, size);
cuMemRelease(handle);
return 1;
}
std::cout << "Successfully mapped memory" << std::endl;
// Set access properties
CUmemAccessDesc accessDesc = {};
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = 0; // Use device 0
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
std::cout << "Setting memory access properties" << std::endl;
result = cuMemSetAccess(ptr, size, &accessDesc, 1);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to set memory access: " << getCudaErrorString(result) << std::endl;
cuMemUnmap(ptr, size);
cuMemAddressFree(ptr, size);
cuMemRelease(handle);
return 1;
}
std::cout << "Successfully set memory access properties" << std::endl;
// Export handle to file descriptor
int fd = 0;
std::cout << "Exporting handle to file descriptor" << std::endl;
result = cuMemExportToShareableHandle(&fd, handle, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to export handle: " << getCudaErrorString(result) << std::endl;
std::cerr << "Handle value: " << handle << std::endl;
std::cerr << "Handle type: CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR" << std::endl;
cuMemUnmap(ptr, size);
cuMemAddressFree(ptr, size);
cuMemRelease(handle);
return 1;
}
std::cout << "Successfully exported handle to fd: " << fd << std::endl;
// Write to file
std::ofstream outfile("data.bin", std::ios::binary);
if (!outfile) {
std::cerr << "Failed to open output file: " << strerror(errno) << std::endl;
cuMemUnmap(ptr, size);
cuMemAddressFree(ptr, size);
cuMemRelease(handle);
return 1;
}
// Write header
ShareHeader header{getpid(), size, 1};
outfile.write(reinterpret_cast<const char*>(&header), sizeof(ShareHeader));
// Write file descriptor
outfile.write(reinterpret_cast<const char*>(&fd), sizeof(int));
outfile.close();
std::cout << "Data written to data.bin. Press Enter to continue..." << std::endl;
std::cin.get();
// Cleanup
result = cuMemUnmap(ptr, size);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to unmap memory: " << getCudaErrorString(result) << std::endl;
}
result = cuMemAddressFree(ptr, size);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to free address range: " << getCudaErrorString(result) << std::endl;
}
result = cuMemRelease(handle);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to release memory handle: " << getCudaErrorString(result) << std::endl;
}
result = cuCtxDestroy(context);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to destroy CUDA context: " << getCudaErrorString(result) << std::endl;
}
return 0;
}
```

```cpp
// receiver.cpp
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <cstring>
#include <unistd.h>
#include <sys/syscall.h>
// Define syscall numbers if not available
#ifndef SYS_pidfd_open
#define SYS_pidfd_open 434
#endif
#ifndef SYS_pidfd_getfd
#define SYS_pidfd_getfd 438
#endif
struct ShareHeader {
pid_t pid;
size_t segment_size;
size_t num_handles;
};
// Helper function to get CUDA error string
const char* getCudaErrorString(CUresult error) {
const char* errorString;
cuGetErrorString(error, &errorString);
return errorString;
}
int main() {
// Initialize CUDA
CUresult result = cuInit(0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to initialize CUDA: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Get CUDA device
CUdevice device;
result = cuDeviceGet(&device, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to get CUDA device: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Create CUDA context
CUcontext context;
result = cuCtxCreate(&context, 0, device);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to create CUDA context: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Read from file
std::ifstream infile("data.bin", std::ios::binary);
if (!infile) {
std::cerr << "Failed to open input file: " << strerror(errno) << std::endl;
return 1;
}
// Read header
ShareHeader header;
infile.read(reinterpret_cast<char*>(&header), sizeof(ShareHeader));
// Open pidfd
auto pidfd = syscall(SYS_pidfd_open, header.pid, 0);
if (pidfd == -1) {
std::cerr << "pidfd_open failed: " << strerror(errno) << std::endl;
return 1;
}
// Read file descriptor
int fd = 0;
infile.read(reinterpret_cast<char*>(&fd), sizeof(int));
infile.close();
// Get our own file descriptor
auto myfd = syscall(SYS_pidfd_getfd, pidfd, fd, 0);
if (myfd == -1) {
std::cerr << "pidfd_getfd failed: " << strerror(errno) << std::endl;
close(pidfd);
return 1;
}
// Import handle
CUmemGenericAllocationHandle handle;
result = cuMemImportFromShareableHandle(
&handle,
reinterpret_cast<void*>(static_cast<uintptr_t>(myfd)),
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to import handle: " << getCudaErrorString(result) << std::endl;
close(myfd);
close(pidfd);
return 1;
}
// Reserve address range
CUdeviceptr ptr;
result = cuMemAddressReserve(&ptr, header.segment_size, 0, 0, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to reserve address range: " << getCudaErrorString(result) << std::endl;
close(myfd);
close(pidfd);
return 1;
}
// Map the memory
result = cuMemMap(ptr, header.segment_size, 0, handle, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to map memory: " << getCudaErrorString(result) << std::endl;
cuMemAddressFree(ptr, header.segment_size);
close(myfd);
close(pidfd);
return 1;
}
// Set access properties
CUmemAccessDesc accessDesc = {};
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = 0; // Use device 0
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
result = cuMemSetAccess(ptr, header.segment_size, &accessDesc, 1);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to set memory access: " << getCudaErrorString(result) << std::endl;
cuMemUnmap(ptr, header.segment_size);
cuMemAddressFree(ptr, header.segment_size);
close(myfd);
close(pidfd);
return 1;
}
std::cout << "Successfully imported and mapped memory at address: " << ptr << std::endl;
std::cout << "Press Enter to continue..." << std::endl;
std::cin.get();
// Cleanup
result = cuMemUnmap(ptr, header.segment_size);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to unmap memory: " << getCudaErrorString(result) << std::endl;
}
result = cuMemAddressFree(ptr, header.segment_size);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to free address range: " << getCudaErrorString(result) << std::endl;
}
result = cuMemRelease(handle);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to release memory handle: " << getCudaErrorString(result) << std::endl;
}
close(myfd);
close(pidfd);
result = cuCtxDestroy(context);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to destroy CUDA context: " << getCudaErrorString(result) << std::endl;
}
return 0;
}
```

Compile with:
```
$ nvcc receiver.cpp -o receiver -lcuda
$ nvcc sender.cpp -o sender -lcuda
```
In one shell, execute `./sender`, and in another shell, execute `./receiver`.
On some nodes it succeeds; on others it does not.
When it fails with `Operation not permitted`, running the receiver with sudo works.
It seems that if the sender process allows any process to ptrace it, then it works.
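For context: the receiver duplicates the sender's file descriptor with the `pidfd_getfd` syscall, and the kernel only allows that if the caller has ptrace-attach permission on the sender, which is why sudo (or letting the sender be ptraced) helps. Below is a minimal sketch of the same dance on an ordinary file descriptor, assuming Linux >= 5.6, Python >= 3.9, and the x86_64 syscall number; the helper name is made up:

```python
# Minimal sketch of the pidfd_open + pidfd_getfd dance used by the receiver.
# We duplicate an ordinary fd instead of a CUDA memory handle; the permission
# check (ptrace-attach access to the owning process) is the same, which is why
# sudo or PR_SET_PTRACER on the sender side makes the CUDA case work.
import ctypes
import os

SYS_pidfd_getfd = 438  # syscall number (matches the #define in the C++ code)

def grab_remote_fd(pid: int, remote_fd: int) -> int:
    libc = ctypes.CDLL(None, use_errno=True)
    pidfd = os.pidfd_open(pid)                    # handle to the target process
    fd = libc.syscall(SYS_pidfd_getfd, pidfd, remote_fd, 0)
    err = ctypes.get_errno()
    os.close(pidfd)
    if fd < 0:
        raise OSError(err, os.strerror(err))      # EPERM -> "Operation not permitted"
    return fd
```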
Add the following code to the sender:
```cpp
#include <sys/prctl.h>

prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY);
```
or the Python equivalent:
```python
import ctypes

# Constants from prctl.h
PR_SET_PTRACER = 0x59616d61
PR_SET_PTRACER_ANY = -1  # Allow any process with the same UID to ptrace

libc = ctypes.CDLL("libc.so.6", use_errno=True)
result = libc.prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0)
if result != 0:
    errno = ctypes.get_errno()
    raise OSError(errno, f"prctl(PR_SET_PTRACER, ANY) failed: {ctypes.cast(libc.strerror(errno), ctypes.c_char_p).value.decode()}")
else:
    print("✅ Allowed ptrace from any same-UID process (PR_SET_PTRACER_ANY)")
```
Then it works.
To use the fabric handle, we should follow https://docs.nvidia.com/multi-node-nvlink-systems/imex-guide/imexchannels.html to create IMEX channels (read access is required). Here is a working example of IPC through the fabric handle:
```cpp
// sender.cpp
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <cstring>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/prctl.h>
// Define syscall numbers if not available
#ifndef SYS_pidfd_open
#define SYS_pidfd_open 434
#endif
// Helper function to get CUDA error string
const char* getCudaErrorString(CUresult error) {
const char* errorString;
cuGetErrorString(error, &errorString);
return errorString;
}
int main() {
prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY);
// Initialize CUDA
CUresult result = cuInit(0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to initialize CUDA: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Get CUDA device
CUdevice device;
result = cuDeviceGet(&device, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to get CUDA device: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Create CUDA context
CUcontext context;
result = cuCtxCreate(&context, 0, device);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to create CUDA context: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Allocate memory using VMM API
const size_t size = 20 * 1024 * 1024; // 20MB
CUmemGenericAllocationHandle handle;
// Set up memory allocation properties
CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.location.id = 0; // Use device 0
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; // Use fabric handle type for IPC
prop.win32HandleMetaData = nullptr;
// Get the minimum granularity supported for allocation
size_t granularity = 0;
result = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to get allocation granularity: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Ensure size is a multiple of granularity
if (size % granularity) {
std::cerr << "Allocation size is not a multiple of minimum supported granularity" << std::endl;
return 1;
}
std::cout << "Creating memory handle with size: " << size << " bytes" << std::endl;
result = cuMemCreate(&handle, size, &prop, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to create memory handle: " << getCudaErrorString(result) << std::endl;
return 1;
}
std::cout << "Successfully created memory handle" << std::endl;
// Reserve address range
CUdeviceptr ptr;
std::cout << "Reserving address range" << std::endl;
result = cuMemAddressReserve(&ptr, size, 0, 0, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to reserve address range: " << getCudaErrorString(result) << std::endl;
cuMemRelease(handle);
return 1;
}
std::cout << "Successfully reserved address range at: " << ptr << std::endl;
// Map the memory
std::cout << "Mapping memory" << std::endl;
result = cuMemMap(ptr, size, 0, handle, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to map memory: " << getCudaErrorString(result) << std::endl;
cuMemAddressFree(ptr, size);
cuMemRelease(handle);
return 1;
}
std::cout << "Successfully mapped memory" << std::endl;
// Set access properties
CUmemAccessDesc accessDesc = {};
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = 0; // Use device 0
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
std::cout << "Setting memory access properties" << std::endl;
result = cuMemSetAccess(ptr, size, &accessDesc, 1);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to set memory access: " << getCudaErrorString(result) << std::endl;
cuMemUnmap(ptr, size);
cuMemAddressFree(ptr, size);
cuMemRelease(handle);
return 1;
}
std::cout << "Successfully set memory access properties" << std::endl;
// Export handle to fabric handle
CUmemFabricHandle_v1 fabricHandle;
std::cout << "Exporting handle to fabric handle" << std::endl;
std::cout << "Original handle value: " << handle << std::endl;
std::cout << "Allocation size: " << size << " bytes" << std::endl;
result = cuMemExportToShareableHandle(&fabricHandle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to export handle: " << getCudaErrorString(result) << std::endl;
std::cerr << "Handle value: " << handle << std::endl;
std::cerr << "Handle type: CU_MEM_HANDLE_TYPE_FABRIC" << std::endl;
cuMemUnmap(ptr, size);
cuMemAddressFree(ptr, size);
cuMemRelease(handle);
return 1;
}
std::cout << "Successfully exported handle to fabric handle" << std::endl;
std::cout << "Fabric handle value: " << reinterpret_cast<uintptr_t>(&fabricHandle) << std::endl;
// Write to file
std::ofstream outfile("data.bin", std::ios::binary);
if (!outfile) {
std::cerr << "Failed to open output file: " << strerror(errno) << std::endl;
cuMemUnmap(ptr, size);
cuMemAddressFree(ptr, size);
cuMemRelease(handle);
return 1;
}
// Write 8-byte size header
outfile.write(reinterpret_cast<const char*>(&size), 8);
// Write fabric handle
outfile.write(reinterpret_cast<const char*>(&fabricHandle), sizeof(CUmemFabricHandle_v1));
outfile.close();
std::cout << "Data written to data.bin. Press Enter to continue..." << std::endl;
std::cin.get();
// Cleanup
result = cuMemUnmap(ptr, size);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to unmap memory: " << getCudaErrorString(result) << std::endl;
}
result = cuMemAddressFree(ptr, size);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to free address range: " << getCudaErrorString(result) << std::endl;
}
result = cuMemRelease(handle);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to release memory handle: " << getCudaErrorString(result) << std::endl;
}
result = cuCtxDestroy(context);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to destroy CUDA context: " << getCudaErrorString(result) << std::endl;
}
return 0;
}
```

```cpp
// receiver.cpp
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <cstring>
// Helper function to get CUDA error string
const char* getCudaErrorString(CUresult error) {
const char* errorString;
cuGetErrorString(error, &errorString);
return errorString;
}
int main() {
// Initialize CUDA
CUresult result = cuInit(0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to initialize CUDA: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Get CUDA device
CUdevice device;
result = cuDeviceGet(&device, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to get CUDA device: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Create CUDA context
CUcontext context;
result = cuCtxCreate(&context, 0, device);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to create CUDA context: " << getCudaErrorString(result) << std::endl;
return 1;
}
// Read from file
std::ifstream infile("data.bin", std::ios::binary);
if (!infile) {
std::cerr << "Failed to open input file: " << strerror(errno) << std::endl;
return 1;
}
// Read 8-byte size header
size_t size;
infile.read(reinterpret_cast<char*>(&size), 8);
std::cout << "Read allocation size: " << size << " bytes" << std::endl;
// Read fabric handle
CUmemFabricHandle_v1 fabricHandle;
infile.read(reinterpret_cast<char*>(&fabricHandle), sizeof(CUmemFabricHandle_v1));
std::cout << "Read fabric handle value: " << reinterpret_cast<uintptr_t>(&fabricHandle) << std::endl;
infile.close();
// Import handle
CUmemGenericAllocationHandle handle;
std::cout << "Importing handle..." << std::endl;
result = cuMemImportFromShareableHandle(
&handle,
reinterpret_cast<void*>(&fabricHandle),
CU_MEM_HANDLE_TYPE_FABRIC
);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to import handle: " << getCudaErrorString(result) << std::endl;
return 1;
}
std::cout << "Successfully imported handle: " << handle << std::endl;
// Reserve address range
CUdeviceptr ptr;
std::cout << "Reserving address range of size: " << size << std::endl;
result = cuMemAddressReserve(&ptr, size, 0, 0, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to reserve address range: " << getCudaErrorString(result) << std::endl;
return 1;
}
std::cout << "Successfully reserved address range at: " << ptr << std::endl;
// Map the memory
std::cout << "Mapping memory..." << std::endl;
result = cuMemMap(ptr, size, 0, handle, 0);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to map memory: " << getCudaErrorString(result) << std::endl;
cuMemAddressFree(ptr, size);
return 1;
}
std::cout << "Successfully mapped memory" << std::endl;
// Set access properties
CUmemAccessDesc accessDesc = {};
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = 0; // Use device 0
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
std::cout << "Setting memory access properties..." << std::endl;
result = cuMemSetAccess(ptr, size, &accessDesc, 1);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to set memory access: " << getCudaErrorString(result) << std::endl;
cuMemUnmap(ptr, size);
cuMemAddressFree(ptr, size);
return 1;
}
std::cout << "Successfully set memory access properties" << std::endl;
std::cout << "Successfully imported and mapped memory at address: " << ptr << std::endl;
std::cout << "Press Enter to continue..." << std::endl;
std::cin.get();
// Cleanup
result = cuMemUnmap(ptr, size);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to unmap memory: " << getCudaErrorString(result) << std::endl;
}
result = cuMemAddressFree(ptr, size);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to free address range: " << getCudaErrorString(result) << std::endl;
}
result = cuMemRelease(handle);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to release memory handle: " << getCudaErrorString(result) << std::endl;
}
result = cuCtxDestroy(context);
if (result != CUDA_SUCCESS) {
std::cerr << "Failed to destroy CUDA context: " << getCudaErrorString(result) << std::endl;
}
return 0;
}
```

Compile with:
```
$ nvcc receiver.cpp -o receiver -lcuda
$ nvcc sender.cpp -o sender -lcuda
```
In one shell, execute `./sender`, and in another shell, execute `./receiver`.
We can see:
```
(py310) ➜ test_pidfd ./sender
Creating memory handle with size: 20971520 bytes
Successfully created memory handle
Reserving address range
Successfully reserved address range at: 139849545809920
Mapping memory
Successfully mapped memory
Setting memory access properties
Successfully set memory access properties
Exporting handle to fabric handle
Original handle value: 94556404496384
Allocation size: 20971520 bytes
Successfully exported handle to fabric handle
Fabric handle value: 140735970184032
Data written to data.bin. Press Enter to continue...
(py310) ➜ test_pidfd ./receiver
Read allocation size: 20971520 bytes
Read fabric handle value: 140734345409808
Importing handle...
Successfully imported handle: 94798849087488
Reserving address range of size: 20971520
Successfully reserved address range at: 140063992184832
Mapping memory...
Successfully mapped memory
Setting memory access properties...
Successfully set memory access properties
Successfully imported and mapped memory at address: 140063992184832
Press Enter to continue...
```
The data file is 72 bytes: an 8-byte header (for the size) and 64 bytes for the fabric handle.
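For illustration, a minimal sketch that parses `data.bin` according to that layout (assuming a little-endian 64-bit host, matching the raw `size_t` write in the sender):

```python
# Minimal sketch: inspect the data.bin written by the fabric-handle sender.
# Layout (from the sender code): 8-byte size_t, then the 64-byte
# CUmemFabricHandle_v1 opaque blob.
import struct

with open("data.bin", "rb") as f:
    blob = f.read()

assert len(blob) == 72, len(blob)
(size,) = struct.unpack("=Q", blob[:8])   # native 64-bit allocation size
fabric_handle = blob[8:]                  # 64 opaque bytes, meaningful only to the driver
print(f"allocation size: {size} bytes, handle bytes: {fabric_handle.hex()}")
```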
Running on 2.6.0.dev20241112+cu124, I still get the same error: `RuntimeError: pidfd_getfd: Operation not permitted`.