limitingFactor #

Description #

Code to test whether computation or memory transfer is the bottleneck. The compiled program is intended to be run under nvprof, so that the run time of each kernel can be compared.

The book demonstrates the effect of compiling with -Mcuda=fastmath, which gives a significant speedup in the “base” and “math” kernels (note that the book's measurements were taken on very old C2050 and K20 GPUs).

Code (C++) #

#include <stdio.h>
#include <stdlib.h>

// Combined kernel: each thread loads one element of b, applies sin to it and
// stores the result in the matching element of a.
//
// Indexing is 1D (x dimension only). There is no bounds guard, so the launch
// must supply exactly one thread per element of a and b.
__global__ void base(float *a, float *b) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    const float in = b[idx];
    a[idx] = sin(in);
}

// Memory-only kernel: each thread copies one element of b into a, with no
// arithmetic on the value.
//
// Indexing is 1D (x dimension only). There is no bounds guard, so the launch
// must supply exactly one thread per element of a and b.
__global__ void memory(float *a, float *b) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    const float value = b[idx];
    a[idx] = value;
}

// Math-only kernel: each thread evaluates sin(b) on a by-value scalar, so no
// global memory is read.
//
//   a    - output array; written only when the condition below holds
//   b    - scalar argument passed to sin
//   flag - multiplier for the sin result; with flag == 0 the product is 0,
//          the comparison is false and nothing is stored
//
// The conditional store presumably exists so that the compiler cannot discard
// the sin evaluation as dead code -- TODO confirm against the book's text.
//
// Indexing is 1D (x dimension only) and there is no bounds guard on a.
__global__ void math(float *a, float b, int flag) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    float v = sin(b);
    // Compare against a float literal: a bare 1.0 is a double and would
    // promote the whole comparison to double precision. The outcome is the
    // same either way, since float -> double conversion is exact.
    if (v*flag == 1.0f) a[i] = v;
}

// Fill kernel: each thread stores val into one element of a.
//
// Used instead of cudaMemset, which fills memory byte by byte and therefore
// cannot produce an arbitrary float value such as 1.0.
//
// Indexing is 1D (x dimension only). There is no bounds guard, so the launch
// must supply exactly one thread per element of a.
__global__ void setval(float *a, float val) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    a[idx] = val;
}

// Print a message and exit if a CUDA runtime call (or kernel launch) failed.
static void checkCuda(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Launches the base, memory and math kernels once each over n elements so
// that a profiler can compare their run times, then copies the result back
// and prints its first element.
int main() {

    const int n = 8*1024*1024, blockSize = 256;
    // n is an exact multiple of blockSize, so this grid gives one thread per
    // element; the kernels have no bounds guard and rely on that.
    const int gridSize = n / blockSize;
    const size_t bytes = n * sizeof(float);

    float *a = (float*)malloc(bytes);
    if (a == NULL) {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)bytes);
        return EXIT_FAILURE;
    }

    float *a_d = NULL, *b_d = NULL;
    checkCuda(cudaMalloc(&a_d, bytes), "cudaMalloc(a_d)");
    checkCuda(cudaMalloc(&b_d, bytes), "cudaMalloc(b_d)");

    setval<<<gridSize, blockSize>>>(b_d, 1.0f);
    checkCuda(cudaGetLastError(), "setval launch");

    base<<<gridSize, blockSize>>>(a_d, b_d);
    checkCuda(cudaGetLastError(), "base launch");
    memory<<<gridSize, blockSize>>>(a_d, b_d);
    checkCuda(cudaGetLastError(), "memory launch");
    math<<<gridSize, blockSize>>>(a_d, 1.0f, 0);
    checkCuda(cudaGetLastError(), "math launch");

    // cudaMemcpy takes (dst, src, count, kind): for a device-to-host copy the
    // host buffer is the destination. This call blocks until the preceding
    // kernels have finished and reports any error they raised.
    checkCuda(cudaMemcpy(a, a_d, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy");
    printf("%f\n", a[0]);

    checkCuda(cudaFree(b_d), "cudaFree(b_d)");
    checkCuda(cudaFree(a_d), "cudaFree(a_d)");
    free(a);

    return 0;
}

Code (Fortran) #

! Device kernels for comparing arithmetic cost against memory cost.
! All three use 1D indexing (x dimension only) and have no bounds guard,
! so each launch must supply exactly one thread per array element.
module kernel_m
contains

    ! Combined kernel: loads one element of b, applies sin, stores it in a.
    attributes(global) subroutine base(a, b)
        real :: a(*), b(*)
        integer :: idx
        ! Block and thread indices start at 1, hence the -1 on blockIdx.
        idx = threadIdx%x + blockDim%x * (blockIdx%x - 1)
        a(idx) = sin(b(idx))
    end subroutine base

    ! Memory-only kernel: copies one element of b into a with no arithmetic.
    attributes(global) subroutine memory(a, b)
        real :: a(*), b(*)
        integer :: idx
        idx = threadIdx%x + blockDim%x * (blockIdx%x - 1)
        a(idx) = b(idx)
    end subroutine memory

    ! Math-only kernel: evaluates sin of the by-value scalar b and stores it
    ! in a only when sin(b)*flag equals 1; with flag = 0 nothing is stored.
    attributes(global) subroutine math(a, b, flag)
        real :: a(*)
        real, value :: b
        integer, value :: flag
        real :: sinVal
        integer :: idx
        idx = threadIdx%x + blockDim%x * (blockIdx%x - 1)
        sinVal = sin(b)
        if (sinVal*flag == 1) then
            a(idx) = sinVal
        end if
    end subroutine math

end module kernel_m

! Launches the base, memory and math kernels once each over n elements so
! that a profiler can compare their run times, then copies the result back
! to the host and prints its first element.
program limitingFactor

    use cudafor
    use kernel_m

    implicit none

    integer, parameter :: n = 8*1024*1024
    integer, parameter :: blockSize = 256
    ! n is an exact multiple of blockSize: one thread per array element.
    integer, parameter :: nBlocks = n/blockSize

    real :: a(n)
    real, device :: a_d(n), b_d(n)

    ! Fill the device input array with ones.
    b_d = 1.0

    call base<<<nBlocks, blockSize>>>(a_d, b_d)
    call memory<<<nBlocks, blockSize>>>(a_d, b_d)
    call math<<<nBlocks, blockSize>>>(a_d, 1.0, 0)

    ! Assignment from a device array copies the data back to the host.
    a = a_d

    print *, a(1)

end program limitingFactor