dim3, e.g., for 2D data:
int N = 50; // e.g. a 50 x 50 matrix
dim3 blockDim(16, 16, 1); // 16 x 16 = 256 threads per block
// Round the number of blocks up in each dimension so the grid covers all N
// elements: (50 + 15) / 16 = 4 blocks -> 4 * 16 = 64 threads, i.e.
// 64 - 50 = 14 surplus threads per dimension.
// NOTE(review): the kernel presumably needs a bounds check so that the
// surplus threads do nothing -- confirm in the kernel code.
// Integer ceiling division is used instead of ceil() on a float quotient:
// int -> float conversion is inexact for N > 2^24 and could produce one
// block too few.
int nGridRows = (N + blockDim.y - 1) / blockDim.y; // blocks along y (rows)
int nGridCols = (N + blockDim.x - 1) / blockDim.x; // blocks along x (columns)
dim3 gridDim(nGridCols, nGridRows, 1); // x counts columns, y counts rows
matmul.cu)