EBNull/_gpu.nix

## _gpu.nix
{
  config,
  pkgs,
  lib,
  ...
}:
let
  modules = {
    wsl = {
      # Baseline settings; always required.
      nixpkgs.config = {
        allowUnfree = true; # for nvidia
      };
      wsl = {
        useWindowsDriver = true;
      };
    };
    nix-ld = {
      # Always required: the WSLg D3D12 driver is an unpatched binary and
      # therefore needs nix-ld in order to run.
      programs.nix-ld = {
        enable = true;
      };
    };
    ld-lib-path = {
      # Always required: lets programs locate the WSLg driver.
      environment.sessionVariables = {
        LD_LIBRARY_PATH = lib.makeLibraryPath [
          # Native Nix applications (like niri or mesa) are built with fixed
          # RPATHs pointing to the Nix Store, but they do not know the path
          # to the WSLg drivers.
          "/run/opengl-driver"
          # The WSLg D3D12 driver may require openssl, but it does not know
          # the path to the Nix Store.
          pkgs.openssl
        ];
      };
    };
    mesa = {
      # Generally required: the host has to tell Mesa to use D3D12 directly
      # instead of Zink, and in doing so, to pick the Nvidia adapter.
      # https://github.com/nix-community/NixOS-WSL/issues/454#issuecomment-3802332145
      environment = {
        sessionVariables = {
          GALLIUM_DRIVER = "d3d12";
          MESA_D3D12_DEFAULT_ADAPTER_NAME = "Nvidia";
        };
        # Not technically required, but helpful for debugging with glxinfo.
        systemPackages = [ pkgs.mesa-demos ];
      };
    };
    docker = {
      # Required only for containers to be able to use the gpu.
      virtualisation.docker = {
        enable = true;
        daemon.settings.features.cdi = true;

        # Rootless mode is switched off; its settings are kept alongside.
        # NOTE(review): presumably these have no effect while
        # rootless.enable is false — confirm.
        rootless = {
          enable = false;
          setSocketVariable = true;
          daemon.settings = {
            features.cdi = true;
            cdi-spec-dirs = [ "/home/${config.wsl.defaultUser}/.cdi" ];
          };
        };

        # enableNvidia is deprecated and replaced by the
        # nvidia-container-toolkit options below.
        # NOTE(review): it is nevertheless still switched on here — confirm
        # whether `docker run --gpus all` still depends on it before removing.
        enableNvidia = true;
      };

      # Let the default WSL user talk to the docker daemon.
      users.users.${config.wsl.defaultUser} = {
        extraGroups = [ "docker" ];
      };

      hardware.nvidia-container-toolkit = {
        enable = true;
        mount-nvidia-executables = true;
        suppressNvidiaDriverAssertion = true; # provided by NixOS-WSL
      };
    };
    hwg = {
      # Graphics stack settings; required for docker usage.
      hardware.graphics.enable = true;
      hardware.graphics.enable32Bit = true;
      hardware.graphics.extraPackages = [ pkgs.mesa ];
    };
    cuda = {
      # Required only for host-level cuda: exports the toolkit's store path.
      environment.sessionVariables.CUDA_PATH = "${pkgs.cudatoolkit}";
    };
    xdriver = {
      # DO NOT USE: this replaces the nvidia-smi provided by WSL with one
      # expecting native access.
      hardware.nvidia = {
        open = true;
      };
      services.xserver = {
        videoDrivers = lib.mkDefault [ "nvidia" ];
      };
    };
    vainfo = {
      # Point libva at the d3d12 driver and install the libva utilities.
      environment = {
        variables.LIBVA_DRIVER_NAME = "d3d12";
        systemPackages = [ pkgs.libva-utils ];
      };

      # For some reason boot.kernelModules does not work for WSL2
      #boot.kernelModules = [ "vgem" ];

      # Instead, a oneshot unit runs modprobe for the vgem module.
      systemd.services.vgem = {
        enable = true;
        wantedBy = [ "multi-user.target" ];
        after = [ "wslg.service" ];
        serviceConfig = {
          Type = "oneshot";
        };
        path = [
          pkgs.kmod
          pkgs.bash
        ];
        script = ''
          ${pkgs.kmod}/bin/modprobe vgem
        '';
      };
    };
    tests = {
      # Installs `checkgpu`, a diagnostic script that runs a series of GPU
      # checks (glxinfo, nvidia-smi, vgem module, /dev/dri, vainfo, docker).
      environment.systemPackages = [
        (pkgs.writeShellApplication {
          name = "checkgpu";
          # Tools the script calls that are packaged in nixpkgs, so the
          # script does not rely on other modules putting them on PATH:
          # glxinfo (mesa-demos), vainfo (libva-utils), lsmod (kmod).
          # nvidia-smi and docker are still taken from the ambient PATH.
          runtimeInputs = [
            pkgs.mesa-demos
            pkgs.libva-utils
            pkgs.kmod
          ];
          # writeShellApplication runs the script with errexit and pipefail,
          # so every check before the last needs an explicit `||` fallback;
          # otherwise the first failing check would abort the script and
          # hide the results of all later checks.
          text = ''
            echo "glxinfo"
            ( ( set -x; glxinfo -B ) | grep -E "(Vendor:|Device:)" ) || echo "glxinfo failed or reported no Vendor/Device"
            echo
            echo "nvidia-smi"
            ( set -x; nvidia-smi -L ) || echo "nvidia-smi failed"
            echo
            echo "check vgem kernel module"
            (lsmod | grep "^vgem") || echo "vgem is not loaded!"
            echo
            echo "check /dev/dri"
            ( [ -e /dev/dri/by-path ] && ls -l /dev/dri/by-path) || echo "/dev/dri/by-path does not exist"
            echo
            echo "check vainfo"
            ( (set -x; vainfo --display drm ) 2>&1 | grep -E '^(\+|vainfo: Driver)' ) || echo "vainfo not using gpu accel"
            echo
            echo "check docker run"
            (set -x; docker run --gpus all nvcr.io/nvidia/k8s/cuda-sample:nbody nbody -gpu -benchmark )
          '';
        })
      ];
    };
  };
in
{
  _class = "nixos";

  # Sub-modules that are switched on. The list is kept in attribute-name
  # order. `cuda` and `xdriver` are defined above but deliberately left out.
  imports = [
    modules.docker
    modules.hwg
    modules.ld-lib-path
    modules.mesa
    modules.nix-ld
    modules.tests
    modules.vainfo
    modules.wsl
    # modules.cuda
  ];
}

## checkgpu_output.txt
glxinfo
+ glxinfo -B
    Vendor: Microsoft Corporation (0xffffffff)
    Device: D3D12 (NVIDIA GeForce RTX 5090) (0xffffffff)

nvidia-smi
+ nvidia-smi -L
GPU 0: NVIDIA GeForce RTX 5090 (UUID: GPU-6302d15c-22ee-fd09-8087-67a3bcad83cd)

check vgem kernel module
vgem 12288 0 - Live 0x0000000000000000

check /dev/dri
total 0
lrwxrwxrwx 1 root root  8 Mar  6 15:35 platform-vgem-card -> ../card0
lrwxrwxrwx 1 root root 13 Mar  6 15:35 platform-vgem-render -> ../renderD128

check vainfo
+ vainfo --display drm
vainfo: Driver version: Mesa Gallium driver 25.2.6 for D3D12 (NVIDIA GeForce RTX 5090)

check docker run
+ docker run --gpus all nvcr.io/nvidia/k8s/cuda-sample:nbody nbody -gpu -benchmark
Run "nbody -benchmark [-numbodies=<numBodies>]" to measure performance.
	-fullscreen       (run n-body simulation in fullscreen mode)
	-fp64             (use double precision floating point values for simulation)
	-hostmem          (stores simulation data in host memory)
	-benchmark        (run benchmark to measure performance)
	-numbodies=<N>    (number of bodies (>= 1) to run in simulation)
	-device=<d>       (where d=0,1,2.... for the CUDA device to use)
	-numdevices=<i>   (where i=(number of CUDA devices > 0) to use for simulation)
	-compare          (compares simulation results running once on the default GPU and once on the CPU)
	-cpu              (run n-body simulation on the CPU)
	-tipsy=<file.bin> (load a tipsy model file for simulation)

NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

> Windowed mode
> Simulation data stored in video memory
> Single precision floating point simulation
> 1 Devices used for simulation
MapSMtoCores for SM 12.0 is undefined.  Default to use 128 Cores/SM
MapSMtoArchName for SM 12.0 is undefined.  Default to use Ampere
GPU Device 0: "Ampere" with compute capability 12.0

> Compute 12.0 CUDA device: [NVIDIA GeForce RTX 5090]
174080 bodies, total time for 10 iterations: 111.739 ms
= 2712.023 billion interactions per second
= 54240.452 single-precision GFLOP/s at 20 flops per interaction
	# NixOS module (_class = "nixos") bundling the GPU-related settings for a
	# WSL host. Each entry of `modules` is a small sub-module; the final
	# attribute set imports the ones that are switched on.
	{
	  config,
	  pkgs,
	  lib,
	  ...
	}:
	let
	  modules = {
	    wsl = {
	      # Always required
	      nixpkgs.config.allowUnfree = true; # for nvidia
	      wsl.useWindowsDriver = true;
	    };
	    nix-ld = {
	      # Always required: WSLg D3D12 driver is an unpatched binary, so it needs nix-ld to run
	      programs.nix-ld.enable = true;
	    };
	    ld-lib-path = {
	      # Always required: programs need to find WSLg driver
	      environment.sessionVariables.LD_LIBRARY_PATH = lib.makeLibraryPath [
	        "/run/opengl-driver" # Native Nix applications (like niri or mesa) are built with fixed RPATHs pointing to the Nix Store, but they do not know the path to the WSLg drivers.
	        pkgs.openssl # WSLg D3D12 driver may require openssl, but it does not know the path to the Nix Store
	      ];
	    };
	    mesa = {
	      # Generally required: the host has to set mesa up to use d3d directly
	      # https://github.com/nix-community/NixOS-WSL/issues/454#issuecomment-3802332145
	      # Mesa should use d3d12 directly instead of Zink, and in doing so, it should use nvidia
	      environment.sessionVariables = {
	        GALLIUM_DRIVER = "d3d12";
	        MESA_D3D12_DEFAULT_ADAPTER_NAME = "Nvidia";
	      };
	      # Not technically required, but helpful for debugging with glxinfo
	      environment.systemPackages = [
	        pkgs.mesa-demos
	      ];
	    };
	    docker = {
	      # Required only for containers to be able to use the gpu
	      virtualisation.docker = {
	        enable = true;
	        rootless = {
	          enable = false;
	          setSocketVariable = true;
	          daemon.settings = {
	            features.cdi = true;
	            cdi-spec-dirs = [ "/home/${config.wsl.defaultUser}/.cdi" ];
	          };
	        };
	        daemon.settings = {
	          features.cdi = true;
	        };
	      };
	      users.users.${config.wsl.defaultUser}.extraGroups = [ "docker" ];

	      hardware.nvidia-container-toolkit.enable = true;
	      hardware.nvidia-container-toolkit.mount-nvidia-executables = true;
	      hardware.nvidia-container-toolkit.suppressNvidiaDriverAssertion = true; # provided by NixOS-WSL

	      # enableNvidia is deprecated and replaced by above
	      # NOTE(review): it is still switched on here — confirm whether it is still needed.
	      virtualisation.docker.enableNvidia = true;
	    };
	    hwg = {
	      # Required for docker usage
	      hardware.graphics = {
	        enable = true;
	        enable32Bit = true;
	        extraPackages = [ pkgs.mesa ];
	      };
	    };
	    cuda = {
	      # Required only for host-level cuda
	      environment.sessionVariables = {
	        CUDA_PATH = "${pkgs.cudatoolkit}";
	      };
	    };
	    xdriver = {
	      # DO NOT USE: this replaces nvidia-smi provided by WSL with one expecting native access
	      hardware.nvidia.open = true;
	      services.xserver.videoDrivers = lib.mkDefault [ "nvidia" ];
	    };
	    vainfo = {
	      environment.variables.LIBVA_DRIVER_NAME = "d3d12";
	      environment.systemPackages = [ pkgs.libva-utils ];

	      # For some reason boot.kernelModules does not work for WSL2
	      #boot.kernelModules = [ "vgem" ];

	      # Instead, a oneshot unit runs modprobe for the vgem module.
	      systemd.services.vgem = {
	        enable = true;
	        serviceConfig.Type = "oneshot";
	        wantedBy = [ "multi-user.target" ];
	        after = [ "wslg.service" ];
	        path = [
	          pkgs.kmod
	          pkgs.bash
	        ];
	        script = ''
	          ${pkgs.kmod}/bin/modprobe vgem
	        '';
	      };
	    };
	    tests = {
	      # Installs `checkgpu`, a diagnostic script that runs a series of GPU checks.
	      environment.systemPackages = [
	        (pkgs.writeShellApplication {
	          name = "checkgpu";
	          runtimeInputs = [ ];
	          # The shell pipes below must be plain `|` / `||`; backslash-escaped
	          # pipes would be passed to the commands as literal arguments.
	          text = ''
	            echo "glxinfo"
	            ( set -x; glxinfo -B ) | grep -E "(Vendor:|Device:)"
	            echo
	            echo "nvidia-smi"
	            ( set -x; nvidia-smi -L )
	            echo
	            echo "check vgem kernel module"
	            (lsmod | grep "^vgem") || echo "vgem is not loaded!" ;
	            echo
	            echo "check /dev/dri"
	            ( [ -e /dev/dri/by-path ] && ls -l /dev/dri/by-path) || echo "/dev/dri/by-path does not exist"
	            echo
	            echo "check vainfo"
	            ( (set -x; vainfo --display drm ) 2>&1 | grep -E '^(\+|vainfo: Driver)' ) || echo "vainfo not using gpu accel"
	            echo
	            echo "check docker run"
	            (set -x; docker run --gpus all nvcr.io/nvidia/k8s/cuda-sample:nbody nbody -gpu -benchmark )
	          '';
	        })
	      ];
	    };
	  };
	in
	{
	  _class = "nixos";

	  # `cuda` is commented out and `xdriver` is not listed, so neither is imported.
	  imports = builtins.attrValues {
	    inherit (modules)
	      wsl
	      nix-ld
	      ld-lib-path
	      mesa
	      docker
	      hwg
	      # cuda
	      vainfo
	      tests
	      ;
	  };
	}
	glxinfo
	+ glxinfo -B
	Vendor: Microsoft Corporation (0xffffffff)
	Device: D3D12 (NVIDIA GeForce RTX 5090) (0xffffffff)

	nvidia-smi
	+ nvidia-smi -L
	GPU 0: NVIDIA GeForce RTX 5090 (UUID: GPU-6302d15c-22ee-fd09-8087-67a3bcad83cd)

	check vgem kernel module
	vgem 12288 0 - Live 0x0000000000000000

	check /dev/dri
	total 0
	lrwxrwxrwx 1 root root 8 Mar 6 15:35 platform-vgem-card -> ../card0
	lrwxrwxrwx 1 root root 13 Mar 6 15:35 platform-vgem-render -> ../renderD128

	check vainfo
	+ vainfo --display drm
	vainfo: Driver version: Mesa Gallium driver 25.2.6 for D3D12 (NVIDIA GeForce RTX 5090)

	check docker run
	+ docker run --gpus all nvcr.io/nvidia/k8s/cuda-sample:nbody nbody -gpu -benchmark
	Run "nbody -benchmark [-numbodies=<numBodies>]" to measure performance.
	-fullscreen (run n-body simulation in fullscreen mode)
	-fp64 (use double precision floating point values for simulation)
	-hostmem (stores simulation data in host memory)
	-benchmark (run benchmark to measure performance)
	-numbodies=<N> (number of bodies (>= 1) to run in simulation)
	-device=<d> (where d=0,1,2.... for the CUDA device to use)
	-numdevices=<i> (where i=(number of CUDA devices > 0) to use for simulation)
	-compare (compares simulation results running once on the default GPU and once on the CPU)
	-cpu (run n-body simulation on the CPU)
	-tipsy=<file.bin> (load a tipsy model file for simulation)

	NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

	> Windowed mode
	> Simulation data stored in video memory
	> Single precision floating point simulation
	> 1 Devices used for simulation
	MapSMtoCores for SM 12.0 is undefined. Default to use 128 Cores/SM
	MapSMtoArchName for SM 12.0 is undefined. Default to use Ampere
	GPU Device 0: "Ampere" with compute capability 12.0

	> Compute 12.0 CUDA device: [NVIDIA GeForce RTX 5090]
	174080 bodies, total time for 10 iterations: 111.739 ms
	= 2712.023 billion interactions per second
	= 54240.452 single-precision GFLOP/s at 20 flops per interaction