CUDA Programming

.cu coding

Lecture :
24F EE514 Parallel Computing
by KAIST Minsoo Rhu VIA Research Group

SPMD Programming

CUDA Programming

// Element-wise vector addition kernel: C[i] = A[i] + B[i] for every i in [0, n).
//
//   A, B : device-accessible input arrays, at least n readable elements each
//   C    : device-accessible output array, at least n writable elements
//   n    : number of elements to add; n <= 0 makes the kernel a no-op
//
// Expects a 1D launch (only the .x components of the built-in index variables
// are used). The body is a loop that advances by the total number of launched
// threads, so the result is complete for ANY 1D grid size:
//   - with a grid that covers n (e.g. (n-1)/blockDim.x + 1 blocks) every
//     thread processes at most one element, exactly as a single guarded
//     `if (i < n)` would;
//   - with a smaller grid each thread simply processes several elements.
__global__ void vecAddKernel(float* A, float* B, float* C, int n)
{
  // Total number of threads in the grid. Computed in 64 bits so that the
  // product gridDim.x * blockDim.x cannot wrap around.
  const long long stride = (long long)gridDim.x * blockDim.x;

  // Global rank of this thread = index of the first element it handles.
  // Kept in 64 bits so it never turns negative when narrowed.
  long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;

  for (; i < n; i += stride)
    C[i] = A[i] + B[i];
}

// Launch configuration for vecAddKernel, where n is the number of elements
// passed to the kernel (its bounds check compares each thread's index to n).

// Block shape: 256 threads along x, 1 along y and z.
dim3 DimBlock(256, 1, 1);

// Grid shape: (n-1)/256 + 1 blocks along x, i.e. ceil(n / 256) for n >= 1,
// so there is at least one thread per element. The block size is cast to int
// so the division stays a signed integer division.
dim3 DimGrid((n - 1) / static_cast<int>(DimBlock.x) + 1, 1, 1);

// Kernel launch syntax:
//   kernel<<<grid dimensions, block dimensions>>>(kernel arguments)
vecAddKernel<<<DimGrid, DimBlock>>>(d_A, d_B, d_C, n);

Tiled Matrix Multiplication (baseline kernel below: one thread per output element, no shared-memory tiling yet)

// Square matrix multiplication kernel: P = M * N.
//
//   M, N  : device-accessible input matrices, Width x Width, stored row-major
//           (element (r, c) is read at offset r * Width + c)
//   P     : device-accessible output matrix, same size and layout
//   Width : number of rows (and columns) of each matrix
//
// Expects a 2D launch: the x dimension selects the output column and the y
// dimension selects the output row. Each thread computes one element of P;
// threads whose (row, col) falls outside the matrix do nothing, so the grid
// may overshoot Width in either dimension.
// Operands are read directly through the M and N pointers on every iteration;
// this kernel does not stage tiles in shared memory.
__global__ void MatrixMulKernel(float* M, float* N, float* P, int Width){
  const int row = blockIdx.y * blockDim.y + threadIdx.y;
  const int col = blockIdx.x * blockDim.x + threadIdx.x;

  if ((row < Width) && (col < Width)){
    // Offsets are computed in 64 bits: row * Width (and k * Width) exceed the
    // range of a 32-bit int as soon as Width is larger than 46340.
    const long long width = Width;
    const long long rowBase = (long long)row * width;  // start of this row in M and P

    float value = 0.0f;
    // Dot product of row `row` of M with column `col` of N.
    for (long long k = 0; k < width; ++k){
      value += M[rowBase + k] * N[k * width + col];
    }

    P[rowBase + col] = value;
  }
}