mpiDeviceUtil

Description

Basic module to assign MPI processes to unique GPUs. It is modified from the code in the book to use MPI_ALLGATHER, an explicit quicksort subroutine, and the hostnm Fortran intrinsic instead of MPI_GET_PROCESSOR_NAME. Each rank gathers the hostnames of all ranks, sorts the list so ranks on the same node can be grouped, splits MPI_COMM_WORLD into one communicator per node, and uses its rank within that communicator as its CUDA device index.
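
For orientation, here is a minimal sketch of the intended calling pattern (the program name is just a placeholder; the complete program at the end of the Fortran listing below does the same, with per-rank reporting):

program example

    use mpi
    use mpiDeviceUtil

    implicit none
    integer:: procid, numprocs, deviceID, ierr

    call MPI_INIT(ierr)
    call MPI_COMM_RANK(MPI_COMM_WORLD, procid, ierr)
    call MPI_COMM_SIZE(MPI_COMM_WORLD, numprocs, ierr)

    ! assignDevice picks a GPU on this rank's node and calls cudaSetDevice
    call assignDevice(procid, numprocs, deviceID)

    ! ... device allocations and kernel launches on this rank now target deviceID ...

    call MPI_FINALIZE(ierr)

end program example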

Code (C++)

To do...

Code (Fortran)

module mpiDeviceUtil
contains

    subroutine assignDevice(procid, numprocs, dev)

        use mpi
        use cudafor

        implicit none
        integer, intent(in):: procid, numprocs
        integer, intent(out):: dev
        character(len=100), allocatable:: hosts(:)
        character(len=100):: hostname
        integer:: color, i
        integer:: newComm, newProcid, ierr

        ! allocate array of hostnames
        allocate(hosts(0:numprocs-1))

        ! every process collects the hostname of all the nodes
        call hostnm(hostname)

        call MPI_ALLGATHER(hostname, 100, MPI_CHARACTER, hosts, 100, &
            MPI_CHARACTER, MPI_COMM_WORLD, ierr)
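        ! hosts(i) now holds the hostname reported by rank i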

        call MPI_BARRIER(MPI_COMM_WORLD,ierr)

        ! sort the list of names
        call quicksort(hosts, 100, 1, numprocs)
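        ! identical hostnames are now adjacent in hosts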

        call MPI_BARRIER(MPI_COMM_WORLD,ierr)

        ! assign the same color to the same node
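        ! walk the sorted list and count the distinct hostnames that precede this
        ! process's own hostname; all ranks on the same node compute the same color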
        color = 0
        do i = 0, numprocs-1
            if (i > 0) then
                if ( hosts(i-1) /= hosts(i) ) color = color + 1
            end if
            if ( hostname <= hosts(i) ) exit
        end do

        call MPI_COMM_SPLIT(MPI_COMM_WORLD, color, 0, newComm, ierr)
        call MPI_COMM_RANK(newComm, newProcid, ierr)
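
        ! ranks on the same node share newComm; each rank's position within it
        ! is used directly as its CUDA device index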

        dev = newProcid
        ierr = cudaSetDevice(dev)

        deallocate(hosts)

    end subroutine assignDevice

    ! quicksort.f -*-f90-*-
    ! Author: t-nissie
    ! License: GPLv3
    ! Gist: https://gist.github.com/t-nissie/479f0f16966925fa29ea
    !!
    recursive subroutine quicksort(a, strlen, first, last)
    
        implicit none
        integer, intent(in):: strlen, first, last
        character(strlen), intent(inout):: a(*)
        character(strlen)::  x, t
        integer:: i, j

        x = a( (first+last) / 2 )
        i = first
        j = last
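
        ! Hoare-style partition: move i and j inward, swapping entries that are
        ! on the wrong side of the pivot x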
        do
           do while (a(i) < x)
              i=i+1
           end do
           do while (x < a(j))
              j=j-1
           end do
           if (i >= j) exit
           t = a(i);  a(i) = a(j);  a(j) = t
           i=i+1
           j=j-1
        end do
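        ! recurse on the unsorted sub-ranges on either side of the partition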
        if (first < i-1) call quicksort(a, strlen, first, i-1)
        if (j+1 < last)  call quicksort(a, strlen, j+1, last)

    end subroutine quicksort

end module mpiDeviceUtil

program main

    use mpi
    use cudafor
    use mpiDeviceUtil

    implicit none
    integer, parameter:: n = 1024*1024
    ! mpi
    character(len=100):: hostname
    integer:: procid, numprocs, ierr
    ! device
    integer(int_ptr_kind()):: freeB, totalB, freeA, totalA
    real, device, allocatable:: d(:)
    integer:: deviceID, i, istat

    call MPI_INIT(ierr)
    call MPI_COMM_RANK(MPI_COMM_WORLD, procid, ierr)
    call MPI_COMM_SIZE(MPI_COMM_WORLD, numprocs, ierr)

    ! get and set unique device
    call assignDevice(procid, numprocs, deviceID)

    ! print hostname and device ID for each rank
    call hostnm(hostname)
    do i = 0, numprocs-1
        call MPI_BARRIER(MPI_COMM_WORLD, ierr)
        if (i == procid) write(*, &
            "('[',i0,'] host: ', a, ',  device: ', i0)") &
            procid, trim(hostname), deviceID
    end do

    ! get memory use before large allocations
    call MPI_BARRIER(MPI_COMM_WORLD, ierr)
    istat = cudaMemGetInfo(freeB, totalB)

    ! allocate memory on each device
    call MPI_BARRIER(MPI_COMM_WORLD, ierr)
    allocate(d(n))

    ! get free memory after allocation
    call MPI_BARRIER(MPI_COMM_WORLD, ierr)
    istat = cudaMemGetInfo(freeA, totalA)
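
    ! (freeB-freeA) is the drop in free memory on this rank's device; dividing
    ! by the array size in bytes (n*4) gives how many such arrays now reside there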

    do i = 0, numprocs-1
        call MPI_BARRIER(MPI_COMM_WORLD, ierr)
        if (i == procid) write(*, &
            "('  [', i0, '] ', 'device arrays allocated: ', i0)") &
            procid, (freeB-freeA)/n/4
    end do

    deallocate(d)

    call MPI_FINALIZE(ierr)

end program main
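
As a usage note, with the NVIDIA HPC SDK this typically builds with the bundled MPI wrapper, e.g. something like mpif90 -cuda mpiDeviceUtil.f90 (older releases use -Mcuda instead of -cuda), and is launched under mpirun with as many ranks per node as there are GPUs; the exact wrapper name and flags depend on the toolchain.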