strideTexture #

Description #

A demonstration showing how the use of textured memory pointers can improve strided global memory access.

I find that using textured memory pointers didn’t improve anything reliably on my NVIDIA 1650.

The deprecation is also why a C++ version is not provided here.

Code (Fortran) #

module kernels_m

    real, texture, pointer:: aTex (:)

contains

    attributes(global) subroutine stride(b, a, s)

        real:: b(*), a(*)
        integer, value:: s
        integer:: i, is
        i = blockDim%x*( blockIdx%x-1)+ threadIdx%x
        is = (blockDim%x*( blockIdx%x-1)+ threadIdx%x) * s
        b(i) = a(is)+1

    end subroutine stride

    attributes(global) subroutine strideTex(b, s)

        real:: b(*)
        integer, value:: s
        integer:: i, is
        i = blockDim%x*( blockIdx%x-1)+ threadIdx%x
        is = (blockDim%x*( blockIdx%x-1)+ threadIdx%x) * s
        b(i) = aTex(is)+1

    end subroutine strideTex

end module kernels_m

program strideTexture

    use cudafor
    use kernels_m

    implicit none
    integer, parameter:: nMB = 4 ! transfer size in MB
    integer, parameter:: n = nMB *1024*1024/4
    integer, parameter:: blockSize = 256
    real, device, allocatable, target:: a_d(:), b_d (:)
    type(cudaEvent):: startEvent, stopEvent
    type(cudaDeviceProp):: prop
    integer:: i, istat, ib
    real:: time

    istat = cudaGetDeviceProperties(prop, 0)
    write(*,'(/," Device: ",a)') trim(prop%name)
    write(*,'(" Transfer size (MB): ",i0,/)') nMB

    allocate(a_d(n*33), b_d(n))

    istat = cudaEventCreate(startEvent)
    istat = cudaEventCreate(stopEvent)

    write(*,*) 'Global version '
    write(*,*) 'Stride, Bandwidth (GB/s)'
    call stride<<<n/blockSize,blockSize>>>(b_d, a_d, 1)
    do i = 1, 32
        a_d = 0.0
        istat = cudaEventRecord(startEvent,0)
        call stride<<<n/blockSize, blockSize>>>(b_d, a_d, i)
        istat = cudaEventRecord(stopEvent,0)
        istat = cudaEventSynchronize(stopEvent)
        istat = cudaEventElapsedTime(time, startEvent, stopEvent)
        write(*,*) i, 2*n*4/time*1.e-6
    enddo

    ! bind the texture
    aTex => a_d

    write(*,*) 'Texture version '
    write(*,*) 'Stride, Bandwidth (GB/s)'
    call strideTex<<<n/blockSize,blockSize>>>(b_d, 1)
    do i = 1, 32
        a_d = 0.0
        istat = cudaEventRecord(startEvent,0)
        call strideTex<<<n/blockSize,blockSize>>>(b_d, i)
        istat = cudaEventRecord(stopEvent,0)
        istat = cudaEventSynchronize(stopEvent)
        istat = cudaEventElapsedTime(time, startEvent, stopEvent)
        write(*,*) i, 2*n*4/time*1.e-6
    enddo

    ! unbind the texture
    nullify(aTex)
    istat = cudaEventDestroy(startEvent)
    istat = cudaEventDestroy(stopEvent)
    deallocate(a_d, b_d)

end program strideTexture