Skip to content

Instantly share code, notes, and snippets.

@maziyarpanahi
Created November 24, 2025 14:24
Show Gist options
  • Select an option

  • Save maziyarpanahi/ffbeffdce95dbfc9dadbb7adf1ce3ead to your computer and use it in GitHub Desktop.

Select an option

Save maziyarpanahi/ffbeffdce95dbfc9dadbb7adf1ce3ead to your computer and use it in GitHub Desktop.
#!/bin/bash
# NVIDIA Driver and CUDA Installation Script for RHEL 9.4 with H100 GPUs
# This script installs the latest stable NVIDIA driver and CUDA toolkit
#
# Usage: run as root (for example through sudo); takes no arguments.
#
# Strict mode:
#   -e           abort on the first command that fails
#   -u           treat expansion of an unset variable as an error
#   -o pipefail  a pipeline fails if any of its stages fails
set -euo pipefail

# Banner. Only the kernel line is detected at runtime (uname -r); the OS and
# GPU lines are fixed strings describing the intended target machine.
echo "=========================================="
echo "NVIDIA H100 Driver & CUDA Installation"
echo "OS: Red Hat Enterprise Linux 9.4"
echo "Kernel: $(uname -r)"
echo "GPUs: 8x NVIDIA H100L"
echo "=========================================="
echo ""

# Check if running as root (bash sets EUID to the effective user id; 0 is root)
if [[ "${EUID}" -ne 0 ]]; then
  # Diagnostics go to stderr so they stay visible when stdout is redirected.
  echo "ERROR: This script must be run as root (use sudo)" >&2
  exit 1
fi
# Step 1: Install kernel development packages matching current kernel
echo "[1/7] Installing kernel development packages..."
# Release string of the running kernel; also used later in the script to
# locate the matching initramfs image.
KERNEL_VERSION=$(uname -r)
# Try the packages pinned to the running kernel first. If dnf cannot install
# that exact version, fall back to the unversioned package names, which
# resolve to whatever the enabled repositories currently offer (and may
# therefore differ from the running kernel).
if ! dnf install -y "kernel-devel-${KERNEL_VERSION}" "kernel-headers-${KERNEL_VERSION}"; then
  echo "Warning: Exact kernel-devel not found, installing latest..." >&2
  dnf install -y kernel-devel kernel-headers
fi
# Step 2: Install build tools and dependencies
echo "[2/7] Installing build tools and dependencies..."
# One package per line keeps the list easy to scan and to diff.
# NOTE(review): on RHEL, dkms is usually shipped through EPEL rather than the
# base repositories -- confirm that repository is enabled before running.
build_deps=(
  gcc
  make
  dkms
  acpid
  libglvnd-glx
  libglvnd-opengl
  libglvnd-devel
  pkgconfig
)
dnf install -y "${build_deps[@]}"
# Step 3: Disable nouveau driver (the community open-source driver for NVIDIA GPUs)
echo "[3/7] Disabling nouveau driver..."
nouveau_conf=/etc/modprobe.d/blacklist-nouveau.conf
if [[ -f "${nouveau_conf}" ]]; then
  # An existing file is left untouched; its contents are not inspected.
  echo "blacklist-nouveau.conf already exists"
else
  printf '%s\n' "blacklist nouveau" "options nouveau modeset=0" > "${nouveau_conf}"
fi

# Back up the current initramfs, then regenerate it.
initramfs_img="/boot/initramfs-${KERNEL_VERSION}.img"
if [[ -f "${initramfs_img}" ]]; then
  # Keep only the first backup, so that re-running the script does not replace
  # the pre-change image with one that was already regenerated.
  if [[ ! -e "${initramfs_img}.bak" ]]; then
    # A failed copy (e.g. a full /boot) is reported but does not abort the
    # installation.
    cp -p -- "${initramfs_img}" "${initramfs_img}.bak" \
      || echo "Warning: could not back up ${initramfs_img}" >&2
  fi
  # --force lets dracut overwrite the existing image file.
  dracut --force
fi
# Step 4: Add NVIDIA CUDA repository
echo "[4/7] Adding NVIDIA CUDA repository..."
# Base URL of NVIDIA's RHEL 9 x86_64 package repository; the signing key is
# published underneath the same path.
cuda_repo_url="https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64"
cuda_gpg_key="${cuda_repo_url}/D42D0685.pub"
# Write the dnf repository definition (overwrites any existing file of that name).
printf '%s\n' \
  "[cuda-rhel9]" \
  "name=CUDA for RHEL 9" \
  "baseurl=${cuda_repo_url}" \
  "enabled=1" \
  "gpgcheck=1" \
  "gpgkey=${cuda_gpg_key}" \
  > /etc/yum.repos.d/cuda-rhel9.repo
# Import the same key into the RPM keyring.
rpm --import "${cuda_gpg_key}"
# Step 5: Clean and update dnf cache
printf '%s\n' "[5/7] Updating package cache..."
# Drop all cached data first, then rebuild the metadata cache from the
# currently enabled repositories.
dnf clean all
dnf makecache

# Step 6: Install NVIDIA driver
printf '%s\n' "[6/7] Installing NVIDIA driver..."
# No version is pinned here: dnf installs whichever cuda-drivers build the
# enabled repositories currently provide. The script's author recommends the
# 570.x driver branch for H100.
dnf install -y cuda-drivers
# Alternative suggested by the author for installing a specific version:
# dnf install -y nvidia-driver-570
# NOTE(review): confirm that this package name exists in the RHEL repository
# before relying on it.
# Step 7: Install CUDA Toolkit 12.4 (version pinned by package name)
echo "[7/7] Installing CUDA Toolkit 12.4..."
dnf install -y cuda-toolkit-12-4

# Point the version-independent path /usr/local/cuda at the 12.4 install tree.
# NOTE(review): the toolkit package may already manage /usr/local/cuda itself
# -- confirm that overriding it here is wanted.
cuda_home=/usr/local/cuda-12.4
if [[ -d "${cuda_home}" ]]; then
  # -n (--no-dereference): when /usr/local/cuda already exists as a symlink to
  # a directory, replace the link itself. Without -n, ln follows the link and
  # creates a stray "cuda-12.4" link inside the directory it points to,
  # leaving /usr/local/cuda unchanged.
  ln -sfn "${cuda_home}" /usr/local/cuda
fi
# Add CUDA to PATH
cuda_profile=/etc/profile.d/cuda.sh
# Skip when the profile script already mentions the CUDA bin directory. grep's
# stderr is discarded because the file may not exist yet.
if ! grep -q "/usr/local/cuda/bin" "${cuda_profile}" 2>/dev/null; then
  # Quoted delimiter: the ${...} expressions are written out literally and are
  # expanded later by whichever shell sources the file, not by this script.
  cat > "${cuda_profile}" << 'EOF'
export PATH=/usr/local/cuda/bin${PATH:+:${PATH}}
export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
EOF
  chmod +x "${cuda_profile}"
fi
# Final summary and post-install instructions, emitted as one literal block
# (quoted delimiter, so nothing inside is expanded).
cat << 'EOF'

==========================================
Installation Complete!
==========================================

IMPORTANT: You must reboot the system for the NVIDIA driver to load.

After reboot, verify installation with:
 nvidia-smi
 nvcc --version

To reboot now, run: reboot

EOF
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment