mpiDevices #

Description #

Code to get started with using MPI together with CUDA Fortran (nvfortran). All it does is check the compute mode of the device each rank uses:

  • default (0): multiple host threads can use a single GPU
  • exclusive (1): one host thread can use a single GPU at a time
  • prohibited (2): no host threads can use the GPU
  • exclusive process (3): only one context can be created on the device by a single process, and that context can be current to all threads of that process

GPUs in default mode allow multiple MPI processes to be assigned to a single GPU, whereas the exclusive and exclusive-process modes allow only one MPI process per GPU.
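
As a quick point of reference, the sketch below (no MPI) queries the current device and prints its compute mode as a readable name. It is a minimal illustration only: the program name queryComputeMode and the printed labels are chosen here for clarity, and the integer values are the ones listed above.

program queryComputeMode

    use cudafor

    implicit none
    type(cudaDeviceProp):: prop
    integer:: istat, devid
    character(len=20):: modeName

    istat = cudaGetDevice(devid)
    istat = cudaGetDeviceProperties(prop, devid)

    ! map the computeMode integer to a readable label (values as listed above)
    select case (prop%computeMode)
    case (0)
        modeName = 'default'
    case (1)
        modeName = 'exclusive'
    case (2)
        modeName = 'prohibited'
    case (3)
        modeName = 'exclusive process'
    case default
        modeName = 'unknown'
    end select

    write(*, "('device ', i0, ' compute mode: ', a)") devid, trim(modeName)

end program queryComputeMode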

Code (C++) #

To do...

Code (Fortran) #

program mpiDevices

    use cudafor
    use mpi

    implicit none
    ! global array size
    integer, parameter:: n = 1024*1024
    ! MPI variables
    integer:: procid, numprocs, ierr
    ! device
    type(cudaDeviceProp):: prop
    integer(int_ptr_kind()):: freeB, totalB, freeA, totalA
    real, device, allocatable:: d(:)
    integer:: i, j, istat, devid

    ! MPI initialization
    call MPI_INIT(ierr)
    call MPI_COMM_RANK(MPI_COMM_WORLD, procid, ierr)
    call MPI_COMM_SIZE(MPI_COMM_WORLD, numprocs, ierr)

    ! print compute mode for device
    istat = cudaGetDevice(devid)
    istat = cudaGetDeviceProperties(prop, devid)

    do i = 0, numprocs-1
        call MPI_BARRIER(MPI_COMM_WORLD, ierr)
        if (procid == i) write(*, &
            "('[',i0,'] using device: ', i0, ' in compute mode: ', i0)") &
            procid, devid, prop%computeMode
    end do

    ! get memory use before large allocations
    call MPI_BARRIER(MPI_COMM_WORLD, ierr)
    istat = cudaMemGetInfo(freeB, totalB)

    ! now allocate arrays, one rank at a time
    do j = 0, numprocs-1

        ! allocate on device associated with rank j
        call MPI_BARRIER(MPI_COMM_WORLD, ierr)
        if (procid == j) allocate(d(n))

        ! Get free memory after allocation
        call MPI_BARRIER(MPI_COMM_WORLD, ierr)
        istat = cudaMemGetInfo(freeA, totalA)

        write(*, "(' [',i0,'] after allocation on rank: ', i0, &
            ', device arrays allocated: ', i0)") &
            procid, j, (freeB-freeA)/n/4

    end do

    deallocate(d)

    call MPI_FINALIZE(ierr)

end program mpiDevices
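
One way to build and run it (assuming the NVIDIA HPC SDK's MPI wrapper around nvfortran, with the source saved as mpiDevices.cuf) is mpif90 mpiDevices.cuf -o mpiDevices followed by mpirun -np 4 ./mpiDevices; the exact wrapper name and flags depend on your MPI installation. Each rank then reports the device it is using, its compute mode, and the number of device arrays allocated after each rank's allocation.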