Skip to content

Instantly share code, notes, and snippets.

@cgmb
Last active April 18, 2025 01:57
Show Gist options
  • Select an option

  • Save cgmb/6ae0d118bf357fc4576a7568b85e1c45 to your computer and use it in GitHub Desktop.

Select an option

Save cgmb/6ae0d118bf357fc4576a7568b85e1c45 to your computer and use it in GitHub Desktop.
Set up ROCm on a G4ad instance

How to set up ROCm 5.4.3 on an Ubuntu 22.04 G4ad instance

Install ROCm 5.4.3

# Refresh the package index and upgrade the installed packages.
sudo apt-get -y update
sudo apt-get -y upgrade
# Extra kernel modules for the AWS kernel (presumably needed for the amdgpu driver; confirm).
sudo apt-get -y install linux-modules-extra-aws
# Download and install the amdgpu-install helper package for ROCm 5.4.3 on Ubuntu jammy.
wget https://repo.radeon.com/amdgpu-install/5.4.3/ubuntu/jammy/amdgpu-install_5.4.50403-1_all.deb
sudo apt-get -y install ./amdgpu-install_5.4.50403-1_all.deb
# Install the "rocmdev" use case with the helper.
sudo amdgpu-install --usecase=rocmdev
# Add the "ubuntu" user to the video and render groups (presumably required for GPU device access; confirm).
sudo usermod -a -G video,render ubuntu
# Reboot the instance.
sudo reboot

You can verify that this installation has been successful by running rocminfo and checking that gfx1011 is listed as one of the agents.

Build and run a sample program

# Installing packages requires root privileges, so run apt-get through sudo.
sudo apt-get install -y cmake build-essential g++-12
# Configure with hipcc as the C++ compiler, targeting the gfx1011 GPU architecture.
CXX=/opt/rocm/bin/hipcc cmake -S. -Bbuild -DAMDGPU_TARGETS=gfx1011 -DCMAKE_PREFIX_PATH=/opt/rocm
# Build, then run the sample.
make -C build
./build/example

Note that AMD does not build the ROCm math libraries for gfx1011 when preparing their official packages, so if you want to use a library like rocSPARSE (whether directly in C++ or indirectly through a framework like PyTorch), you will have to build it from source yourself. The Spack package manager supports building for specific amdgpu architectures and is a useful tool for this purpose.

# Minimal CMake project that builds the HIP sample program from main.cpp.
cmake_minimum_required(VERSION 3.16)
# Only the C++ language is enabled for this project.
project(example LANGUAGES CXX)
# Locate the HIP CMake package; configuration fails if it cannot be found.
find_package(hip REQUIRED)
add_executable(example main.cpp)
# hip::device is presumably the imported target for compiling device code,
# provided by the hip package located above (confirm against the ROCm CMake docs).
target_link_libraries(example PRIVATE hip::device)
#include <stdio.h>
#include <stdlib.h>
#include <hip/hip_runtime.h>
// Evaluates a HIP runtime call exactly once and terminates the program if it
// did not return hipSuccess. On failure it prints
//   <file>:<line>: <error string> (<error code>)
// to stderr and exits with EXIT_FAILURE.
// The temporary has a deliberately unusual name so that it cannot shadow a
// caller variable that appears inside `expr`.
#define CHECK_HIP(expr) do { \
  hipError_t check_hip_result_ = (expr); \
  if (check_hip_result_ != hipSuccess) { \
    fprintf(stderr, "%s:%d: %s (%d)\n", \
            __FILE__, __LINE__, \
            hipGetErrorString(check_hip_result_), check_hip_result_); \
    exit(EXIT_FAILURE); \
  } \
} while(0)
// GPU kernel: squares the first n elements of arr in place, one element per thread.
__global__ void sq_arr(float *arr, int n) {
  // Flat one-dimensional index of this thread across the whole launch.
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx >= n) {
    return;  // threads past the end of the array have nothing to do
  }
  const float value = arr[idx];
  arr[idx] = value * value;
}
int main() {
enum { N = 5 };
float hArr[N] = { 1, 2, 3, 4, 5 };
float *dArr;
CHECK_HIP(hipMalloc(&dArr, sizeof(float) * N));
CHECK_HIP(hipMemcpy(dArr, hArr, sizeof(float) * N, hipMemcpyHostToDevice));
sq_arr<<<dim3(1), dim3(32,1,1), 0, 0>>>(dArr, N);
CHECK_HIP(hipMemcpy(hArr, dArr, sizeof(float) * N, hipMemcpyDeviceToHost));
for (int i = 0; i < N; ++i) {
printf("%f\n", hArr[i]);
}
CHECK_HIP(hipFree(dArr));
return 0;
}
@ggouaillardet
Copy link

Thanks for the quick reply!

I will give it a shot once I can get access to such an instance, and share my findings.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment