p2pBandwidth #
Description #
Program to measure the bandwidth of memory transfer between a GPUs on a multi-GPU host.
A useful technique shown here, is the use of derived types with device member arrays to manage each GPU’s instance of the device arrays.
Code (C++) #
#include <stdio.h>
#include <cstdlib>
struct distributedArray {
float* a_d;
};
__global__ void setVal(float* __restrict__ array, float val) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
array[i] = val;
}
int main() {
const int N = 4*1024*1024;
const size_t nbytes = N*sizeof(float);
// displaying number of GPUs
int nDevices;
cudaGetDeviceCount(&nDevices);
// displaying GPU names
cudaDeviceProp prop;
for (int i = 0; i < nDevices; ++i) {
cudaGetDeviceProperties(&prop, i);
printf("Device %d: %s\n", i, prop.name);
}
printf("\n");
distributedArray distArray[nDevices];
// enable p2p access between GPUs (if possible)
int access;
for (int j = 0; j < nDevices; ++j) {
// sets GPU
cudaSetDevice(j);
// allocate device array on the set GPU
cudaMalloc(&distArray[j].a_d, nbytes);
setVal<<<N/1024, 1024>>>(distArray[j].a_d, j);
for (int i = j+1; i < nDevices; ++i) {
cudaDeviceCanAccessPeer(&access, j, i);
if (access == 1) {
cudaSetDevice(j);
cudaDeviceEnablePeerAccess(i, 0);
cudaSetDevice(i);
cudaDeviceEnablePeerAccess(j, 0);
}
}
}
// allocating array to record inter-GPU bandwidths
float bandwidth[nDevices][nDevices];
for (int i = 0; i < nDevices; ++i) {
for (int j = 0; j < nDevices; ++j) {
bandwidth[i][j] = 0.0;
}
}
// measure p2p bandwidth between each pair of GPUs
cudaEvent_t startEvent, stopEvent;
float time;
float *array = (float*)malloc(nbytes);
for (int j = 0; j < nDevices; ++j) {
cudaSetDevice(j);
cudaEventCreate(&startEvent);
cudaEventCreate(&stopEvent);
for (int i = 0; i < nDevices; ++i) {
if (i == j) continue;
cudaEventRecord(startEvent, 0);
cudaMemcpyPeer(distArray[j].a_d, j, distArray[i].a_d, i, nbytes);
cudaEventRecord(stopEvent, 0);
cudaEventSynchronize(stopEvent);
cudaEventElapsedTime(&time, startEvent, stopEvent);
cudaMemcpy(array, distArray[j].a_d, nbytes, cudaMemcpyDeviceToHost);
for (int k = 0; k < N; ++k) {
if (array[k] != i) {
printf("Transfer between GPUs %d and %d failed!\n", j, i);
std::exit(1);
}
}
bandwidth[j][i] = N*sizeof(float)/time/1.0e6;
}
setVal<<<N/1024, 1024>>>(distArray[j].a_d, j);
cudaEventDestroy(startEvent);
cudaEventDestroy(stopEvent);
}
printf("Bandwidth (GB/s) for transfer size (MB): %f\n", nbytes/1024.0/1024.0);
printf(" S\\R 0");
for (int i = 1; i < nDevices; ++i) printf(" %3d", i);
printf("\n");
for (int j = 0; j < nDevices; ++j) {
printf("%3d", j);
for (int i = 0; i < nDevices; ++i) {
if (i==j) {
printf(" 0 ");
} else {
printf("%8.2f", bandwidth[j][i]);
}
}
printf("\n");
}
// cleanup
for (int j = 0; j < nDevices; ++j) {
cudaFree(distArray[j].a_d);
}
}
Code (Fortran) #
program p2pBandwidth
use cudafor
implicit none
integer, parameter:: N = 4*1024*1024
type distributedArray
real, device, allocatable:: a_d(:)
end type distributedArray
type(distributedArray), allocatable:: distArray(:)
real, allocatable:: bandwidth(:,:)
real:: array(N), time
integer:: nDevices, access, i, j, istat
type(cudaDeviceProp):: prop
type(cudaEvent):: startEvent, stopEvent
! displaying number of GPUs
istat = cudaGetDeviceCount(nDevices)
write(*,"('Number of CUDA-capable devices: ', i0,/)") nDevices
! displaying GPU names
do i = 0, nDevices-1
istat = cudaGetDeviceProperties(prop, i)
write(*,"('Device ', i0, ': ', a)") i, trim(prop%name)
end do
write(*,*)
! creating device array for each GPU
allocate(distArray(0:nDevices-1))
! enable p2p access between GPUs (if possible)
do j = 0, nDevices-1
istat = cudaSetDevice(j) ! sets GPU
allocate(distArray(j)%a_d(N)) ! allocates device array on the set GPU
distArray(j)%a_d = j
do i = j+1, nDevices-1
istat = cudaDeviceCanAccessPeer(access, j, i)
if (access == 1) then
istat = cudaSetDevice(j)
istat = cudaDeviceEnablePeerAccess(i, 0)
istat = cudaSetDevice(i)
istat = cudaDeviceEnablePeerAccess(j, 0)
end if
end do
end do
! allocating array to record inter-GPU bandwidths
allocate(bandwidth(0:nDevices-1, 0:nDevices-1))
bandwidth(:,:) = 0.
! measure p2p bandwidth between each pair of GPUs
do j = 0, nDevices-1
istat = cudaSetDevice(j)
istat = cudaEventCreate(startEvent)
istat = cudaEventCreate(stopEvent)
do i = 0, nDevices-1
if (i == j) cycle
istat = cudaMemcpyPeer(distArray(j)%a_d, j, distArray(i)%a_d, i, N)
istat = cudaEventRecord(startEvent, 0)
istat = cudaMemcpyPeer(distArray(j)%a_d, j, distArray(i)%a_d, i, N)
istat = cudaEventRecord(stopEvent, 0)
istat = cudaEventSynchronize(stopEvent)
istat = cudaEventElapsedTime(time, startEvent, stopEvent)
array = distArray(j)%a_d
if (all(array == i)) bandwidth(j,i) = N*4/time/1.0E+6
end do
distArray(j)%a_d = j
istat = cudaEventDestroy(startEvent)
istat = cudaEventDestroy(stopEvent)
end do
write(*,"('Bandwidth (GB/s) for transfer size (MB): ', f9.3,/)") N*4./1024**2
write(*,"(' S\\R 0')", advance='no')
do i = 1, nDevices-1
write(*,"(5x,i3)", advance='no') i
end do
write(*,*)
do j = 0, nDevices-1
write(*,"(i3)", advance='no') j
do i = 0, nDevices-1
if (i==j) then
write(*,"(4x,'0',3x)", advance='no')
else
write(*,"(f8.2)", advance='no') bandwidth(j,i)
end if
end do
write(*,*)
end do
! cleanup
do j = 0, nDevices-1
deallocate(distArray(j)%a_d)
end do
deallocate(distArray, bandwidth)
end program p2pBandwidth