```julia
# 2D linear diffusion solver - GPU MPI
using Printf
using JLD2
using CUDA
using MPI
# include(joinpath(@__DIR__, "../../shared.jl"))
include(joinpath(@__DIR__, "shared.jl"))
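# shared.jl is not included in this gist; based on the calls below it is assumed
# to provide init_params_gpu_mpi, init_arrays_gpu_mpi, and print_perf.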
# Leftover from an unfinished threads-based emulation (see the note at the bottom of
# this page); the MPI version derives the process grid via MPI.Dims_create instead
# and needs no such check.
# if !isinteger(sqrt(Threads.nthreads()))
#     error("Number of threads must be square, e.g. 4, 9, or 16.")
# end
# convenience macros to avoid writing out the nested finite-difference expressions
macro qx(ix, iy) esc(:(-D * (C[$ix+1, $iy] - C[$ix, $iy]) * inv(dx))) end
macro qy(ix, iy) esc(:(-D * (C[$ix, $iy+1] - C[$ix, $iy]) * inv(dy))) end
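# The macros discretise the diffusive fluxes q = -D ∇C with first-order differences:
#   qx[i,j] ≈ -D * (C[i+1,j] - C[i,j]) / dx,   qy[i,j] ≈ -D * (C[i,j+1] - C[i,j]) / dy
# so the kernel below is one explicit (forward-Euler) step of ∂C/∂t = -∇·q = D ∇²C:
#   C2[i,j] = C[i,j] - dt * ((qx[i,j] - qx[i-1,j]) / dx + (qy[i,j] - qy[i,j-1]) / dy)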
function diffusion_step_kernel!(params, C2, C)
    (; dx, dy, dt, D) = params
    ix = (blockIdx().x - 1) * blockDim().x + threadIdx().x # global thread index in x
    iy = (blockIdx().y - 1) * blockDim().y + threadIdx().y # global thread index in y
    if ix <= size(C, 1)-2 && iy <= size(C, 2)-2
        @inbounds C2[ix+1, iy+1] = C[ix+1, iy+1] - dt * ((@qx(ix + 1, iy + 1) - @qx(ix, iy + 1)) * inv(dx) +
                                                         (@qy(ix + 1, iy + 1) - @qy(ix + 1, iy)) * inv(dy))
    end
    return nothing
end
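# Each thread updates one interior point; the +1 offsets skip the halo (boundary)
# layer, which is filled by update_halo! below rather than computed in the kernel.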
function diffusion_step!(params, C2, C)
    (; nthreads, nblocks) = params
    @cuda threads = nthreads blocks = nblocks diffusion_step_kernel!(params, C2, C)
    return nothing
end
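# nthreads/nblocks are assumed to be chosen by init_params_gpu_mpi (in shared.jl)
# such that nthreads .* nblocks covers the (ns-2) x (ns-2) interior of the local array.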
# MPI functions
@views function update_halo!(A, bufs, neighbors, comm)
    #
    # Help:
    #   left  neighbor: neighbors.x[1]
    #   right neighbor: neighbors.x[2]
    #   up    neighbor: neighbors.y[1]
    #   down  neighbor: neighbors.y[2]
    #
    # dim-1 (x)
    (neighbors.x[1] != MPI.PROC_NULL) && copyto!(bufs.send_1_1, A[2    , :])
    (neighbors.x[2] != MPI.PROC_NULL) && copyto!(bufs.send_1_2, A[end-1, :])
    reqs = MPI.MultiRequest(4)
    (neighbors.x[1] != MPI.PROC_NULL) && MPI.Irecv!(bufs.recv_1_1, comm, reqs[1]; source=neighbors.x[1])
    (neighbors.x[2] != MPI.PROC_NULL) && MPI.Irecv!(bufs.recv_1_2, comm, reqs[2]; source=neighbors.x[2])
    (neighbors.x[1] != MPI.PROC_NULL) && MPI.Isend(bufs.send_1_1, comm, reqs[3]; dest=neighbors.x[1])
    (neighbors.x[2] != MPI.PROC_NULL) && MPI.Isend(bufs.send_1_2, comm, reqs[4]; dest=neighbors.x[2])
    MPI.Waitall(reqs) # blocking
    (neighbors.x[1] != MPI.PROC_NULL) && copyto!(A[1  , :], bufs.recv_1_1)
    (neighbors.x[2] != MPI.PROC_NULL) && copyto!(A[end, :], bufs.recv_1_2)
    # dim-2 (y)
    (neighbors.y[1] != MPI.PROC_NULL) && copyto!(bufs.send_2_1, A[:, 2    ])
    (neighbors.y[2] != MPI.PROC_NULL) && copyto!(bufs.send_2_2, A[:, end-1])
    reqs = MPI.MultiRequest(4)
    (neighbors.y[1] != MPI.PROC_NULL) && MPI.Irecv!(bufs.recv_2_1, comm, reqs[1]; source=neighbors.y[1])
    (neighbors.y[2] != MPI.PROC_NULL) && MPI.Irecv!(bufs.recv_2_2, comm, reqs[2]; source=neighbors.y[2])
    (neighbors.y[1] != MPI.PROC_NULL) && MPI.Isend(bufs.send_2_1, comm, reqs[3]; dest=neighbors.y[1])
    (neighbors.y[2] != MPI.PROC_NULL) && MPI.Isend(bufs.send_2_2, comm, reqs[4]; dest=neighbors.y[2])
    MPI.Waitall(reqs) # blocking
    (neighbors.y[1] != MPI.PROC_NULL) && copyto!(A[:, 1  ], bufs.recv_2_1)
    (neighbors.y[2] != MPI.PROC_NULL) && copyto!(A[:, end], bufs.recv_2_2)
    return
end
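# The send buffers hold the outermost *computed* rows/columns (index 2 and end-1);
# the receive buffers fill the halo layers (index 1 and end). Staging through the
# contiguous device buffers below assumes a CUDA-aware MPI build, so messages move
# GPU-to-GPU without a host round trip.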
function init_bufs(A)
    return (; send_1_1=CUDA.zeros(Float64, size(A, 2)), send_1_2=CUDA.zeros(Float64, size(A, 2)),
              send_2_1=CUDA.zeros(Float64, size(A, 1)), send_2_2=CUDA.zeros(Float64, size(A, 1)),
              recv_1_1=CUDA.zeros(Float64, size(A, 2)), recv_1_2=CUDA.zeros(Float64, size(A, 2)),
              recv_2_1=CUDA.zeros(Float64, size(A, 1)), recv_2_2=CUDA.zeros(Float64, size(A, 1)))
end
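# x-direction halos exchange whole rows of length size(A, 2); y-direction halos
# exchange whole columns of length size(A, 1), hence the two buffer sizes.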
function run_diffusion(; ns=64, nt=100, do_save=false)
    MPI.Init()
    comm = MPI.COMM_WORLD
    nprocs = MPI.Comm_size(comm)
    dims = MPI.Dims_create(nprocs, (0, 0)) |> Tuple
    comm_cart = MPI.Cart_create(comm, dims)
    me = MPI.Comm_rank(comm_cart)
    coords = MPI.Cart_coords(comm_cart) |> Tuple
    # Unfinished sketch of a threads-based emulation of the Cartesian grid, kept for
    # reference (see the note at the bottom of this page):
    #   d = isqrt(Threads.nthreads())
    #   dims = (d, d)
    #   allids = 0:(Threads.nthreads()-1)
    #   allcoords = divrem.(allids, d)
    #   idgrid = permutedims(reshape(allids, (d, d)), (2, 1))
    #   coordgrid = permutedims(reshape(allcoords, (d, d)), (2, 1))
    #   me = Threads.threadid() - 1
    #   coords = divrem(me, d)
    # For a 2x2 grid the rank layout is:
    #   0 1
    #   2 3
    neighbors = (; x=MPI.Cart_shift(comm_cart, 0, 1), y=MPI.Cart_shift(comm_cart, 1, 1))
    # select GPU on multi-GPU system based on shared memory topology
    comm_l = MPI.Comm_split_type(comm, MPI.COMM_TYPE_SHARED, me)
    me_l = MPI.Comm_rank(comm_l)
    # set the GPU based on the node-local rank
    gpu_id = CUDA.device!(me_l % ndevices())
    println("$(gpu_id), out of: $(ndevices())")
    (me == 0) && println("nprocs = $(nprocs), dims = $dims")
    params = init_params_gpu_mpi(; dims, coords, ns, nt, do_save)
    C, C2 = init_arrays_gpu_mpi(params)
    bufs = init_bufs(C)
    t_tic = 0.0
    # Time loop
    for it in 1:nt
        # time after warmup (ignore the first 10 iterations)
        (it == 11) && (t_tic = Base.time())
        # diffusion
        diffusion_step!(params, C2, C)
        update_halo!(C2, bufs, neighbors, comm_cart)
        C, C2 = C2, C # pointer swap
    end
    # synchronize the GPU before querying the final time
    CUDA.synchronize()
    t_toc = (Base.time() - t_tic)
    # "master" prints performance
    (me == 0) && print_perf(params, t_toc)
    # save to (maybe) visualize later
    if do_save
        jldsave(joinpath(@__DIR__, "out_$(me).jld2"); C=Array(C[2:end-1, 2:end-1]), lxy=(; lx=params.L, ly=params.L))
    end
    MPI.Finalize()
    return
end
# Running things...
# default: don't save to disk
(!@isdefined do_save) && (do_save = false)
# default: run the solver when the file is included
(!@isdefined do_run) && (do_run = true)
if do_run
    # run_diffusion(; ns=256, do_save)
    run_diffusion(; ns=16384, do_save)
end
```
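A multi-process run has to go through an MPI launcher. Below is a minimal sketch using MPI.jl's `mpiexec` wrapper with 4 ranks (one GPU each); the filename `diffusion_2d_gpu_mpi.jl` is a placeholder for wherever this gist is saved:

```julia
# Launch the solver on 4 MPI ranks via the launcher shipped with MPI.jl.
# Assumes the script above is saved as "diffusion_2d_gpu_mpi.jl" (hypothetical
# name) and that the MPI build is CUDA-aware.
using MPI
MPI.mpiexec() do exe
    run(`$exe -n 4 julia --project diffusion_2d_gpu_mpi.jl`)
end
```

With `MPI.Dims_create`, 4 ranks yield a 2x2 process grid, matching the rank layout sketched in `run_diffusion`.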
Note: incomplete; check diffusion_2d_threads_weakscaling.jl first.