transposeMPI #
Description #
Code showing the use of tiling and shared memory in transposing a matrix that is distributed across multiple GPUs. The book uses it as an example of the performance difference between CUDA-aware MPI transfers and non-CUDA-aware MPI transfers, in which data moving between GPUs is staged through the respective host CPUs. The code also works when the communicating GPUs are on different nodes.
Code (C++) #
To do...
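Until the C++ version is added, here is a minimal, untested CUDA C++ sketch of the tiled shared-memory transpose kernel, written to mirror the Fortran kernel below. It is not the book's code: the names cudaTileDim, blockRows, and cudaTranspose are carried over from the Fortran version, and the arrays are assumed to be column-major with explicit leading dimensions so the indexing matches the Fortran layout.

#include <cuda_runtime.h>

// tile and thread-block parameters (match the Fortran module parameters)
const int cudaTileDim = 32;
const int blockRows   = 8;

// Transpose one cudaTileDim x cudaTileDim tile per thread block.
// odata and idata are column-major with leading dimensions ldo and ldi.
__global__ void cudaTranspose(float *odata, int ldo,
                              const float *idata, int ldi)
{
    // pad the fastest-varying dimension by one element to avoid
    // shared memory bank conflicts on the transposed reads
    __shared__ float tile[cudaTileDim][cudaTileDim+1];

    int x = blockIdx.x*cudaTileDim + threadIdx.x;
    int y = blockIdx.y*cudaTileDim + threadIdx.y;

    // coalesced reads of the input tile into shared memory
    for (int j = 0; j < cudaTileDim; j += blockRows)
        tile[threadIdx.y+j][threadIdx.x] = idata[x + (y+j)*ldi];

    __syncthreads();

    // swap block indices so the transposed writes are also coalesced
    x = blockIdx.y*cudaTileDim + threadIdx.x;
    y = blockIdx.x*cudaTileDim + threadIdx.y;

    for (int j = 0; j < cudaTileDim; j += blockRows)
        odata[x + (y+j)*ldo] = tile[threadIdx.x][threadIdx.y+j];
}

Like the Fortran kernel, this sketch assumes the array dimensions are integral multiples of cudaTileDim and would be launched with a thread block of (cudaTileDim, blockRows) and a grid of (mpiTileDimX/cudaTileDim, mpiTileDimY/cudaTileDim).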
Code (Fortran) #
module transpose_m
implicit none
integer, parameter:: cudaTileDim = 32
integer, parameter:: blockRows = 8
contains
attributes(global) subroutine cudaTranspose(odata, ldo, idata, ldi)
integer, value, intent(in):: ldo, ldi
real, intent(out):: odata(ldo, *)
real, intent(in):: idata(ldi, *)
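! first dimension of the tile is padded to avoid shared memory bank conflicts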
real, shared:: tile(cudaTileDim+1, cudaTileDim)
integer:: x, y, j
x = (blockIdx%x-1) * cudaTileDim + threadIdx%x
y = (blockIdx%y-1) * cudaTileDim + threadIdx%y
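! coalesced read of a tile from global memory into shared memory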
do j = 0, cudaTileDim-1, blockRows
tile(threadIdx%x, threadIdx%y+j) = idata(x, y+j)
end do
call syncthreads()
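! swap block indices so the transposed writes to odata are coalesced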
x = (blockIdx%y-1) * cudaTileDim + threadIdx%x
y = (blockIdx%x-1) * cudaTileDim + threadIdx%y
do j = 0, cudaTileDim-1, blockRows
odata(x, y+j) = tile(threadIdx%y+j, threadIdx%x)
end do
end subroutine
end module transpose_m
!
! Main code
!
program transpose_MPI
use cudafor
use mpi
use transpose_m
use mpiDeviceUtil
implicit none
! global array size
integer, parameter:: nx = 2048, ny = 2048
! host arrays (global)
real:: idata_h(nx, ny), tdata_h(ny, nx), gold(ny, nx)
! CUDA vars and device arrays
integer:: deviceID, nDevices
type(dim3):: dimGrid, dimBlock
real, device, allocatable:: idata_d(:, :), tdata_d(:, :), sTile_d(:, :), &
rTile_d(:, :)
! MPI stuff
integer:: mpiTileDimX, mpiTileDimY
integer:: procid, numprocs, tag, ierr, localRank
integer:: nstages, stage, sRank, rRank
integer:: status(MPI_STATUS_SIZE)
double precision:: timeStart, timeStop
character(len=10):: localRankStr
integer:: i, j, nyl, jl, jg, p
integer:: xOffset, yOffset
! MPI initialization
call MPI_INIT(ierr)
call MPI_COMM_RANK(MPI_COMM_WORLD, procid, ierr)
call MPI_COMM_SIZE(MPI_COMM_WORLD, numprocs, ierr)
ierr = cudaGetDeviceCount(nDevices)
! check parameters and calculate execution configuration
if (mod(nx, numprocs) == 0 .and. mod(ny, numprocs) == 0) then
mpiTileDimX = nx/numprocs
mpiTileDimY = ny/numprocs
else
write(*,*) "ny must be an integral multiple of numprocs"
call MPI_ABORT(MPI_COMM_WORLD, 1, ierr)
end if
if (mod(mpiTileDimX, cudaTileDim) /= 0 .or. mod(mpiTileDimY, cudaTileDim) &
/= 0) then
write(*,*) "mpiTileDimX and mpiTileDimY must be an integral multiple", &
" of cudaTileDim"
call MPI_ABORT(MPI_COMM_WORLD, 1, ierr)
end if
if (numprocs > nDevices) then
write(*,*) "numprcs must be <= number of GPUs on the system!"
call MPI_ABORT(MPI_COMM_WORLD, 1, ierr)
end if
! assign a device to each MPI process and set the execution configuration
call assignDevice(procid, numprocs, deviceID)
dimGrid = dim3(mpiTileDimX/cudaTileDim, mpiTileDimY/cudaTileDim, 1)
dimBlock = dim3(cudaTileDim, blockRows, 1)
! write parameters to terminal
if (procid == 0) then
write(*,*)
write(*,"('Array size: ', i0,' x ',i0,/)") nx, ny
write(*,"('CUDA block size: ', i0,' x ',i0,', CUDA tile size: ', &
i0,' x ',i0,/)") cudaTileDim, blockRows, cudaTileDim, cudaTileDim
write(*,"('dimGrid: ', i0,' x ',i0,' x ',i0,' dimBlock: ', i0,&
' x ',i0,' x ',i0,/)") dimGrid%x, dimGrid%y, dimGrid%x, &
dimBlock%x, dimBlock%y, dimBlock%z
write(*,"('numprocs: ', i0, ', Local input array size: ', &
i0,' x ',i0,/)") numprocs, nx, mpiTileDimY
write(*,"('mpiTileDim: ', i0,' x ',i0,/)") mpiTileDimX, mpiTileDimY
end if
! initialize data
! host - each process has entire array on host (for now)
do p = 0, numprocs-1
do jl = 1, mpiTileDimY
jg = p*mpiTileDimY + jl
do i = 1, nx
idata_h(i, jg) = i + (jg-1)*nx
end do
end do
end do
gold = transpose(idata_h)
! device - each process has nx*mpiTileDimY = ny*mpiTileDimX elements
allocate( idata_d(nx, mpiTileDimY), &
tdata_d(ny, mpiTileDimX), &
sTile_d(mpiTileDimX, mpiTileDimY), &
rTile_d(mpiTileDimX, mpiTileDimY))
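! copy this rank's slab of the global input array to the device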
yOffset = procid*mpiTileDimY
idata_d(1:nx, 1:mpiTileDimY) = idata_h(1:nx, yOffset+1:yOffset+mpiTileDimY)
tdata_d = -1.0
! ---------
! transpose
! ---------
call MPI_BARRIER(MPI_COMM_WORLD, ierr)
timeStart = MPI_WTIME()
! 0th stage - local transpose
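! (each rank transposes the diagonal block it already holds, so no MPI transfer is needed)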
call cudaTranspose<<<dimGrid, dimBlock>>>&
(tdata_d(procid*mpiTileDimY+1, 1), ny, &
idata_d(procid*mpiTileDimX+1, 1), nx)
! other stages that involve MPI transfers
do stage = 1, numprocs-1
! sRank = the rank to which procid sends data
! rRank = the rank from which procid receives data
sRank = modulo(procid-stage, numprocs)
rRank = modulo(procid+stage, numprocs)
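! each stage is a circular shift: every rank sends one tile and receives one tile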
call MPI_BARRIER(MPI_COMM_WORLD, ierr)
! pack tile so data to be sent is contiguous
!$cuf kernel do (2) <<<*, *>>>
do j = 1, mpiTileDimY
do i = 1, mpiTileDimX
sTile_d(i, j) = idata_d(sRank*mpiTileDimX+i, j)
end do
end do
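! exchange tiles directly between device buffers; passing device arrays
! to MPI_SENDRECV requires a CUDA-aware MPI implementation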
call MPI_SENDRECV(sTile_d, mpiTileDimX*mpiTileDimY, MPI_REAL, sRank, &
procid, rTile_d, mpiTileDimX*mpiTileDimY, MPI_REAL, rRank, rRank, &
MPI_COMM_WORLD, status, ierr)
! do transpose from receive tile into final array
! (no need to unpack)
call cudaTranspose<<<dimGrid, dimBlock>>>&
(tdata_d(rRank*mpiTileDimY+1, 1), ny, rTile_d, mpiTileDimX)
end do
call MPI_BARRIER(MPI_COMM_WORLD, ierr)
timeStop = MPI_WTIME()
! check results
tdata_h(:, 1:mpiTileDimX) = tdata_d
xOffset = procid*mpiTileDimX
if (all(tdata_h(1:ny, 1:mpiTileDimX) == &
gold(1:ny, xOffset+1:xOffset+mpiTileDimX))) then
if (procid == 0) then
write(*,"('Bandwidth (GB/s): ', f7.2,/)") 2.*(nx*ny*4)/&
(1.e9*(timeStop-timeStart))
end if
else
write(*,"('[',i0,']', '*** Failed ***')") procid
end if
deallocate(idata_d, tdata_d, sTile_d, rTile_d)
call MPI_FINALIZE(ierr)
end program transpose_MPI
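Note: assignDevice comes from the mpiDeviceUtil module, which is not listed here. Assuming an MPI wrapper around a CUDA Fortran compiler (for example, mpif90 from the NVIDIA HPC SDK), the program can typically be built with something like mpif90 transposeMPI.cuf and launched with mpirun -np <numprocs> ./a.out; the file name and invocation are illustrative only. Per the parameter checks above, numprocs must evenly divide both nx and ny, and nx/numprocs and ny/numprocs must be multiples of cudaTileDim.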