2.1 Data Parallelism
2.2 CUDA C Program Structure
A vector addition kernel
// Sequential (CPU-only) vector addition: writes C_h[i] = A_h[i] + B_h[i]
// for every i in [0, n). All three pointers refer to host arrays holding
// at least n floats; n == 0 is a no-op.
void vecAdd(float* A_h, float* B_h, float* C_h, int n) {
    int idx = 0;
    while (idx < n) {
        C_h[idx] = A_h[idx] + B_h[idx];
        ++idx;
    }
}
// Illustrative driver skeleton: A, B, C, N are assumed to be declared and
// initialized elsewhere (not shown in these notes) -- this is not complete,
// compilable code, just the call shape.
int main() {
// allocate memory on host, etc
vecAdd(A, B, C, N);
// ...
}
// Host wrapper for GPU vector addition: copies the n-element input arrays
// A_h and B_h to the device, launches vecAddKernel, and copies the sum back
// into host array C_h. (Error checking of the CUDA API calls is omitted to
// match the notes' style; real code should check every cudaError_t.)
void vecAdd(float* A_h, float* B_h, float* C_h, int n) {
    size_t size = (size_t)n * sizeof(float);  // byte count, not element count
    float *A_d, *B_d, *C_d;

    // Part 1: allocate memory on device and copy from host
    cudaMalloc((void**)&A_d, size);
    cudaMalloc((void**)&B_d, size);
    cudaMalloc((void**)&C_d, size);
    cudaMemcpy(A_d, A_h, size, cudaMemcpyHostToDevice);
    cudaMemcpy(B_d, B_h, size, cudaMemcpyHostToDevice);

    // Part 2: actually do the addition
    // ceil(n / 256) blocks of 256 threads; the kernel's bounds check handles
    // the extra threads in the last (possibly partial) block.
    int threadsPerBlock = 256;
    int numBlocks = (n + threadsPerBlock - 1) / threadsPerBlock;
    vecAddKernel<<<numBlocks, threadsPerBlock>>>(A_d, B_d, C_d, n);

    // Part 3: copy the result back to host and free memory on device
    // (this cudaMemcpy blocks until the kernel has finished)
    cudaMemcpy(C_h, C_d, size, cudaMemcpyDeviceToHost);
    cudaFree(A_d);
    cudaFree(B_d);
    cudaFree(C_d);
}
2.4 Device Global Memory and Data Transfer
Device memory is allocated and freed with the cudaMalloc(ptr_address, size)
and cudaFree(ptr)
functions, where size
is the number of required bytes, ptr_address
is the address of a pointer to either put stuff or free stuff from, and ptr
is a pointer. These are similar to malloc
and free
in regular C, except malloc
only takes a size
input, not a pointer as well. Data is copied between host and device with cudaMemcpy(destination, source, size, <transfer_type>)
, where <transfer_type>
is either cudaMemcpyHostToDevice
, cudaMemcpyDeviceToHost
, or similar for Host-Host transfer or Device-Device transfer.
2.5 Kernel Functions and Threading
Kernel code can use the built-in variables blockIdx
(which block are you in), blockDim
(how many threads are in the block), and threadIdx
(which thread within the block is this). each of these are themselves structs which have three fields x
, y
, and z
, which are like coordinates in case of multidimensional data (i.e., if you are only dealing with 1d vectors, you only need blockIdx.x
to specify your block). then the overall thread can be identified by blockIdx.x * blockDim.x + threadIdx.x
. Functions are declared with one of three qualifiers: __host__
(code executed on host and called by host–this is just a regular C function, and not providing a header defaults to __host__
), __device__
(code executed on device and called on-device), or __global__
(code executed on-device and called from either host or device)
// Device kernel: one thread per element, computes C[idx] = A[idx] + B[idx].
// Launched over a 1D grid of 1D blocks. Grids are made of whole blocks, so
// when n is not a multiple of the block size the last block has threads past
// the end of the data -- the bounds guard keeps them from writing out of range.
__global__
void vecAddKernel(float* A, float* B, float* C, int n) {
    // flat global thread index across the whole grid
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        C[idx] = A[idx] + B[idx];
    }
}
A function can be marked with both __host__
and __device__
, and two copies of the function will be created, one for host execution and one for device execution.
2.6 Calling Kernel Functions
Kernels are launched with the execution-configuration brackets <<<
and >>>
, e.g., vecAddKernel<<<n_blocks, block_size>>>(A, B, C, n)
2.7 Compilation