directTransfer #

Description #

Demonstration code showing the difference in data transfer rates when transfering directly between peer GPUs, versus without p2p transfer.

Running this on the GPUs at work, showed the transfer between GPUs with p2p disabled was maybe 5% slower, but this wasn’t produced reliably. The differnece was not nearly as dramatic as those in the book.

Code (C++) #

To do...

Code (Fortran) #

program directTransfer

    use cudafor

    implicit none
    integer, parameter:: N = 4*1024*1024
    real, pinned, allocatable:: a(:), b(:)
    real, device, allocatable:: a_d(:), b_d(:)

    ! these hold free and total memory before and after
    ! allocation, used to verify allocation is happening
    ! on proper devices
    integer(int_ptr_kind()), allocatable:: freeBefore(:), totalBefore(:), freeAfter(:), totalAfter(:)
    integer:: istat, nDevices, i, accessPeer, timingDev
    type(cudaDeviceProp):: prop
    type(cudaEvent):: startEvent, stopEvent
    real:: time

    ! allocate host arrays
    allocate(a(N), b(N))
    allocate(freeBefore(0: nDevices-1), totalBefore(0: nDevices-1))
    allocate(freeAfter(0: nDevices-1), totalAfter(0: nDevices-1))

    ! get device info(including total and free memory)
    ! before allocating a_d and b_d on devices 0 and 1
    istat = cudaGetDeviceCount(nDevices)
    if(nDevices < 2) then
        write(*,*) 'Need at least two CUDA capable devices '
        stop
    end if
    write(*,"('Number of CUDA -capable devices: ', i0,/)") nDevices
    do i = 0, nDevices-1
        istat = cudaGetDeviceProperties(prop, i)
        istat = cudaSetDevice(i)
        istat = cudaMemGetInfo(freeBefore(i), totalBefore(i))
    end do
    istat = cudaSetDevice(0)
    allocate(a_d(N))
    istat = cudaSetDevice(1)
    allocate(b_d(N))

    ! print out free memory before and after allocation
    write(*,"('Allocation summary ')")
    do i = 0, nDevices-1
        istat = cudaGetDeviceProperties(prop, i)
        write(*,"(' Device ', i0, ': ', a)") i, trim(prop%name)
        istat = cudaSetDevice(i)
        istat = cudaMemGetInfo(freeAfter(i), totalAfter(i))
        write(*,"(' Free memory before: ', i0, ', after: ', i0, ', difference: ',i0,/)") &
            freeBefore(i), freeAfter(i), freeBefore(i)-freeAfter(i)
    end do

    ! check whether devices 0 and 1 can use P2P
    if(nDevices > 1) then
        istat = cudaDeviceCanAccessPeer(accessPeer, 0, 1)
        if(accessPeer == 1) then
            write(*,*) 'Peer access available between 0 and 1'
        else
            write(*,*) 'Peer access not available between 0 and 1'
        end if
    end if

    ! initialize
    a = 1.0
    istat = cudaSetDevice(0)
    a_d = a
    ! perform test twice, timing on both sending GPU
    ! and receiving GPU
    do timingDev = 0, 1
        write(*,"(/,'Timing on device ', i0, /)") timingDev

        ! create events on the timing device
        istat = cudaSetDevice(timingDev)
        istat = cudaEventCreate(startEvent)
        istat = cudaEventCreate(stopEvent)

        if(accessPeer == 1) then
            ! enable P2P communication
            istat = cudaSetDevice(0)
            istat = cudaDeviceEnablePeerAccess(1, 0)
            istat = cudaSetDevice(1)
            istat = cudaDeviceEnablePeerAccess(0, 0)

            ! transfer(implicitly) across devices
            b_d = -1.0
            istat = cudaSetDevice(timingDev)
            istat = cudaEventRecord(startEvent,0)
            b_d = a_d
            istat = cudaEventRecord(stopEvent,0)
            istat = cudaEventSynchronize(stopEvent)
            istat = cudaEventElapsedTime(time, startEvent, stopEvent)
            b = b_d
            if(any(b /= a)) then
                write(*,"('Transfer failed ')")
            else
                write(*,"('b_d=a_d transfer(GB/s): ', f)") N*4/time/1.0E+6
            end if
        end if

        ! transfer via cudaMemcpyPeer()
        if(accessPeer == 0) istat = cudaSetDevice(1)
        b_d = -1.0

        istat = cudaSetDevice(timingDev)
        istat = cudaEventRecord(startEvent,0)
        istat = cudaMemcpyPeer(b_d, 1, a_d, 0, N)
        istat = cudaEventRecord(stopEvent,0)
        istat = cudaEventSynchronize(stopEvent)
        istat = cudaEventElapsedTime(time, startEvent, stopEvent)
        if(accessPeer == 0) istat = cudaSetDevice(1)
        b = b_d
        if(any(b /= a)) then
            write(*,"('Transfer failed ')")
        else
            write(*,"('cudaMemcpyPeer transfer(GB/s): ', f)") N*4/time/1.0E+6
        end if

        ! cudaMemcpyPeer with P2P disabled
        if(accessPeer == 1) then
            istat = cudaSetDevice(0)
            istat = cudaDeviceDisablePeerAccess(1)
            istat = cudaSetDevice(1)
            istat = cudaDeviceDisablePeerAccess(0)
            b_d = -1.0
            istat = cudaSetDevice(timingDev)
            istat = cudaEventRecord(startEvent,0)
            istat = cudaMemcpyPeer(b_d, 1, a_d, 0, N)
            istat = cudaEventRecord(stopEvent,0)
            istat = cudaEventSynchronize(stopEvent)
            istat = cudaEventElapsedTime(time, startEvent, stopEvent)
            istat = cudaSetDevice(1)
            b = b_d
            if(any(b /= a)) then
                write(*,"('Transfer failed ')")
            else
                write(*,"('cudaMemcpyPeer transfer w/ P2P ', ' disabled(GB/s): ', f)") N*4/time/1.0E+6
            end if
        end if
        
        ! destroy events associated with timingDev
        istat = cudaEventDestroy(startEvent)
        istat = cudaEventDestroy(stopEvent)
    end do

    ! clean up
    deallocate(freeBefore, totalBefore, freeAfter, totalAfter)
    deallocate(a, b, a_d, b_d)

end program directTransfer