Created
November 24, 2025 14:24
-
-
Save maziyarpanahi/ffbeffdce95dbfc9dadbb7adf1ce3ead to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# NVIDIA Driver and CUDA Installation Script for RHEL 9.4 with H100 GPUs
#
# Installs the NVIDIA driver (cuda-drivers meta-package) and a CUDA toolkit
# from NVIDIA's rhel9 package repository, blacklists the nouveau driver, and
# exposes CUDA on PATH / LD_LIBRARY_PATH for login shells.
#
# Usage: run as root (e.g. via sudo). A reboot is required afterwards.
# Env:   CUDA_VERSION  toolkit release to install, as MAJOR.MINOR
#                      (default: 12.4)

set -euo pipefail

readonly CUDA_VERSION="${CUDA_VERSION:-12.4}"
readonly CUDA_REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64"
readonly CUDA_REPO_FILE="/etc/yum.repos.d/cuda-rhel9.repo"
readonly NOUVEAU_BLACKLIST="/etc/modprobe.d/blacklist-nouveau.conf"
readonly CUDA_PROFILE="/etc/profile.d/cuda.sh"

# Assigned separately from `readonly` so a failing `uname` is not masked.
KERNEL_VERSION="$(uname -r)"
readonly KERNEL_VERSION

# Print a message to stderr and abort the script with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the informational banner describing the target system.
print_banner() {
  echo "=========================================="
  echo "NVIDIA H100 Driver & CUDA Installation"
  echo "OS: Red Hat Enterprise Linux 9.4"
  echo "Kernel: ${KERNEL_VERSION}"
  echo "GPUs: 8x NVIDIA H100L"
  echo "=========================================="
  echo ""
}

# Abort unless the script is running with root privileges and CUDA_VERSION
# has the MAJOR.MINOR shape that the package and directory names rely on.
check_prerequisites() {
  if [[ "${EUID}" -ne 0 ]]; then
    die "ERROR: This script must be run as root (use sudo)"
  fi
  if [[ ! "${CUDA_VERSION}" =~ ^[0-9]+\.[0-9]+$ ]]; then
    die "ERROR: CUDA_VERSION must look like MAJOR.MINOR (got '${CUDA_VERSION}')"
  fi
}

# Step 1: install kernel-devel/kernel-headers for the running kernel, falling
# back to the newest available packages when the exact version is not found.
install_kernel_devel() {
  echo "[1/7] Installing kernel development packages..."
  if ! dnf install -y \
    "kernel-devel-${KERNEL_VERSION}" \
    "kernel-headers-${KERNEL_VERSION}"; then
    # NOTE(review): the fallback packages may not match the running kernel;
    # confirm the driver module still gets built for the kernel that will
    # actually be booted.
    echo "Warning: Exact kernel-devel not found, installing latest..." >&2
    dnf install -y kernel-devel kernel-headers
  fi
}

# Step 2: install the compiler toolchain and libraries the driver needs.
install_build_deps() {
  echo "[2/7] Installing build tools and dependencies..."
  # NOTE(review): dkms is usually provided by EPEL on RHEL — confirm that
  # repository is enabled, otherwise this step aborts the script.
  dnf install -y gcc make dkms acpid libglvnd-glx libglvnd-opengl \
    libglvnd-devel pkgconfig
}

# Step 3: blacklist the nouveau kernel driver and rebuild the initramfs.
disable_nouveau() {
  echo "[3/7] Disabling nouveau driver..."
  if [[ -f "${NOUVEAU_BLACKLIST}" ]]; then
    echo "blacklist-nouveau.conf already exists"
  else
    printf '%s\n' \
      "blacklist nouveau" \
      "options nouveau modeset=0" \
      > "${NOUVEAU_BLACKLIST}"
  fi

  # Regenerate (not back up) the initramfs of the running kernel; this only
  # happens when an image for that kernel already exists under /boot.
  if [[ -f "/boot/initramfs-${KERNEL_VERSION}.img" ]]; then
    dracut --force
  fi
}

# Step 4: write the NVIDIA CUDA repository definition and import its GPG key.
add_cuda_repo() {
  echo "[4/7] Adding NVIDIA CUDA repository..."
  printf '%s\n' \
    "[cuda-rhel9]" \
    "name=CUDA for RHEL 9" \
    "baseurl=${CUDA_REPO_URL}" \
    "enabled=1" \
    "gpgcheck=1" \
    "gpgkey=${CUDA_REPO_URL}/D42D0685.pub" \
    > "${CUDA_REPO_FILE}"

  rpm --import "${CUDA_REPO_URL}/D42D0685.pub"
}

# Step 5: drop cached metadata and rebuild it so the new repository is seen.
refresh_package_cache() {
  echo "[5/7] Updating package cache..."
  dnf clean all
  dnf makecache
}

# Step 6: install the driver through the cuda-drivers meta-package.
install_driver() {
  echo "[6/7] Installing NVIDIA driver..."
  # Installs whatever version the repository currently offers.
  # NOTE(review): the original suggested "dnf install -y nvidia-driver-570"
  # for pinning a version; verify that package name exists in the rhel9
  # repository before relying on it.
  dnf install -y cuda-drivers
}

# Step 7: install the CUDA toolkit and point /usr/local/cuda at it.
install_toolkit() {
  local toolkit_pkg="cuda-toolkit-${CUDA_VERSION//./-}"
  local toolkit_dir="/usr/local/cuda-${CUDA_VERSION}"

  echo "[7/7] Installing CUDA Toolkit ${CUDA_VERSION}..."
  dnf install -y "${toolkit_pkg}"

  if [[ -d "${toolkit_dir}" ]]; then
    # -n: if /usr/local/cuda is already a symlink to a directory, replace the
    # link itself. Plain `ln -sf` would follow it and create a nested link
    # inside the directory it points to.
    ln -sfn "${toolkit_dir}" /usr/local/cuda
  fi
}

# Create the profile.d snippet that puts CUDA on PATH and LD_LIBRARY_PATH,
# unless an existing snippet already mentions /usr/local/cuda/bin.
configure_environment() {
  if ! grep -qF -- "/usr/local/cuda/bin" "${CUDA_PROFILE}" 2>/dev/null; then
    # Single quotes are deliberate: the expansions must be written literally
    # and evaluated later by the shell that sources the snippet.
    # shellcheck disable=SC2016
    printf '%s\n' \
      'export PATH=/usr/local/cuda/bin${PATH:+:${PATH}}' \
      'export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}' \
      > "${CUDA_PROFILE}"
    # The snippet is sourced, not executed, so it only has to be readable.
    chmod 0644 "${CUDA_PROFILE}"
  fi
}

# Print the post-installation instructions.
print_summary() {
  echo ""
  echo "=========================================="
  echo "Installation Complete!"
  echo "=========================================="
  echo ""
  echo "IMPORTANT: You must reboot the system for the NVIDIA driver to load."
  echo ""
  echo "After reboot, verify installation with:"
  echo " nvidia-smi"
  echo " nvcc --version"
  echo ""
  echo "To reboot now, run: reboot"
  echo ""
}

# Run every installation step in order; any failing step aborts the script.
main() {
  print_banner
  check_prerequisites
  install_kernel_devel
  install_build_deps
  disable_nouveau
  add_cuda_repo
  refresh_package_cache
  install_driver
  install_toolkit
  configure_environment
  print_summary
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment