offsetNStride #
Description #
Demonstration of how coalesced access to GPU global memory, i.e., having the 16 threads of a half-warp or the 32 threads of a warp access contiguous memory, reduces the number of memory transactions issued and hence the run time. The kernels measure effective bandwidth for offset and strided array accesses; with 4-byte elements, for example, a warp reading 32 consecutive elements touches 128 contiguous bytes, while a stride-2 read spreads those same 32 elements over 256 bytes, so roughly twice as much data is moved for the same useful work.
My NVIDIA 1650 behaves like the K20 and C2050 used in the book: zero-offset and unit-stride accesses are fastest, and everything else is only slightly worse.
Code (C++) #
#include <stdio.h>
#include "precision.h"

__global__ void offset(userfp_t* a, int s) {
  int i = blockDim.x*blockIdx.x + threadIdx.x + s;
  a[i] += 1.0;
}

__global__ void stride(userfp_t* a, int s) {
  int i = (blockDim.x*blockIdx.x + threadIdx.x)*s;
  a[i] += 1.0;
}

int main() {
  const int nMB = 4, n = nMB*1024*1024/sizeof(userfp_t), blockSize = 256;
  const size_t nbytes = n*sizeof(userfp_t);
  userfp_t *a_d, *b_d;
  cudaDeviceProp prop;
  cudaEvent_t startEvent, stopEvent;
  float time;

  // array dimensions are 33*n for the stride cases
  cudaMalloc(&a_d, 33*nbytes);
  cudaMalloc(&b_d, 33*nbytes);

  cudaGetDeviceProperties(&prop, 0);
  printf("Device: %s\n", prop.name);
  printf("Transfer size (MB): %d\n", nMB);
  if (sizeof(userfp_t) == sizeof(double)) {
    printf("Double Precision\n");
  } else {
    printf("Single Precision\n");
  }

  cudaEventCreate(&startEvent);
  cudaEventCreate(&stopEvent);

  printf("Offset, Bandwidth (GB/s):\n");
  offset<<<n/blockSize, blockSize>>>(b_d, 0);  // warm-up launch
  for (int i = 0; i < 33; ++i) {
    cudaMemset(a_d, 0, 33*nbytes);
    cudaEventRecord(startEvent, 0);
    offset<<<n/blockSize, blockSize>>>(a_d, i);
    cudaEventRecord(stopEvent, 0);
    cudaEventSynchronize(stopEvent);
    cudaEventElapsedTime(&time, startEvent, stopEvent);
    // 2*nbytes: each element is read and written; time is in ms
    printf("%d %f\n", i, 2.0*nbytes/time*1.e-6);
  }

  printf("\nStride, Bandwidth (GB/s):\n");
  stride<<<n/blockSize, blockSize>>>(b_d, 1);  // warm-up launch
  for (int i = 1; i < 33; ++i) {
    cudaMemset(a_d, 0, 33*nbytes);
    cudaEventRecord(startEvent, 0);
    stride<<<n/blockSize, blockSize>>>(a_d, i);
    cudaEventRecord(stopEvent, 0);
    cudaEventSynchronize(stopEvent);
    cudaEventElapsedTime(&time, startEvent, stopEvent);
    printf("%d %f\n", i, 2.0*nbytes/time*1.e-6);
  }

  cudaFree(a_d);
  cudaFree(b_d);
  cudaEventDestroy(startEvent);
  cudaEventDestroy(stopEvent);
}
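The listing assumes a small local header precision.h that defines userfp_t as either float or double; that header is not shown above. A minimal sketch of what it might contain (only the name userfp_t comes from the listing; the DOUBLE switch and everything else here is an assumption):

// precision.h -- hypothetical sketch, not the original header
#ifndef PRECISION_H
#define PRECISION_H
#ifdef DOUBLE
typedef double userfp_t;   // build with -DDOUBLE for double precision
#else
typedef float userfp_t;    // default: single precision
#endif
#endif

With a header along those lines, the example would typically be compiled with nvcc, e.g. nvcc -O3 offsetNStride.cu (adding -DDOUBLE to exercise the double-precision branch).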
Code (Fortran) #
module kernels_m
  use precision_m
contains
  attributes(global) subroutine offset(a, s)
    real(f):: a(*)
    integer, value:: s
    integer:: i
    i = blockDim%x*(blockIdx%x-1) + threadIdx%x + s
    a(i) = a(i) + 1._f
  end subroutine offset

  attributes(global) subroutine stride(a, s)
    real(f):: a(*)
    integer, value:: s
    integer:: i
    i = (blockDim%x*(blockIdx%x-1) + threadIdx%x)*s
    a(i) = a(i) + 1._f
  end subroutine stride
end module kernels_m

program offsetNStride
  use cudafor
  use kernels_m
  implicit none

  integer, parameter:: nMB = 4 ! transfer size in MB
  integer, parameter:: n = nMB*1024*1024/f
  integer, parameter:: blockSize = 256
  ! array dimensions are 33*n for the stride cases
  real(f), device:: a_d(33*n), b_d(33*n)
  type(cudaEvent):: startEvent, stopEvent
  type(cudaDeviceProp):: prop
  integer:: i, istat
  real:: time

  istat = cudaGetDeviceProperties(prop, 0)
  write(*,'(/," Device: ",a)') trim(prop%name)
  write(*,'(" Transfer size (MB): ",i0)') nMB
  if (kind(a_d) == sf) then
    write(*,'(a,/)') 'Single Precision'
  else
    write(*,'(a,/)') 'Double Precision'
  end if

  istat = cudaEventCreate(startEvent)
  istat = cudaEventCreate(stopEvent)

  write(*,*) 'Offset, Bandwidth (GB/s):'
  call offset<<<n/blockSize, blockSize>>>(b_d, 0) ! warm-up launch
  do i = 0, 32
    a_d = 0._f
    istat = cudaEventRecord(startEvent, 0)
    call offset<<<n/blockSize, blockSize>>>(a_d, i)
    istat = cudaEventRecord(stopEvent, 0)
    istat = cudaEventSynchronize(stopEvent)
    istat = cudaEventElapsedTime(time, startEvent, stopEvent)
    write(*,*) i, 2*n*f/time*1.e-6 ! read + write; time is in ms
  end do

  write(*,*)
  write(*,*) 'Stride, Bandwidth (GB/s):'
  call stride<<<n/blockSize, blockSize>>>(b_d, 1) ! warm-up launch
  do i = 1, 32
    a_d = 0._f
    istat = cudaEventRecord(startEvent, 0)
    call stride<<<n/blockSize, blockSize>>>(a_d, i)
    istat = cudaEventRecord(stopEvent, 0)
    istat = cudaEventSynchronize(stopEvent)
    istat = cudaEventElapsedTime(time, startEvent, stopEvent)
    write(*,*) i, 2*n*f/time*1.e-6
  end do

  istat = cudaEventDestroy(startEvent)
  istat = cudaEventDestroy(stopEvent)
end program offsetNStride
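The Fortran version likewise relies on a precision_m module (not shown) that supplies the kind parameter f used for the device arrays and the single-precision kind sf used in the kind(a_d) comparison; whichever kind f is set to in that module determines which precision branch is reported. It would typically be compiled as CUDA Fortran with nvfortran, for example by giving the source a .cuf extension.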