1. Who can start CUDA programming? – Anyone who has some basic knowledge of C programming. No prior knowledge of graphics programming is required.
– CUDA is C with minimal extensions and some restrictions (CUDA is C for the GPU).
2. If you have an algorithm that you want to write in CUDA, follow the steps below. 3. Start by writing your algorithm in C/C++. 4. Change your C/C++ implementation according to the CUDA requirements. 5. The basic flow of a CUDA implementation is: allocate CPU memory >> allocate the same amount of GPU memory >> load the input data into CPU memory >> copy the data into GPU memory >> process the data in GPU memory >> copy the final result back to CPU memory. 6. A simple example follows that calculates the cosine of each integer from 0 to 1023 (1024 values).
7. C implementation of the algorithm.
#include <math.h>
#include <stdlib.h>

/*
 * CPU reference: writes y[i] = cos(x[i]) for every i in [0, n).
 *
 *   n  number of elements to process
 *   x  input array, at least n floats
 *   y  output array, at least n floats
 */
void vcos( int n, float* x, float* y )
{
    for ( int i = 0; i < n; ++i )
        y[i] = cos( x[i] );
}

int main()
{
    int n = 1024;
    float *host_x = (float*)malloc( n*sizeof(float) );
    float *host_y = (float*)malloc( n*sizeof(float) );

    /* Bail out cleanly if either allocation failed (free(NULL) is a no-op). */
    if ( host_x == NULL || host_y == NULL ) {
        free(host_x);
        free(host_y);
        return 1;
    }

    /* Input data: the values 0, 1, ..., n-1. */
    for ( int i = 0; i < n; ++i )
        host_x[i] = (float)i;

    vcos( n, host_x, host_y );
    /* host_y now contains cos(x) data */

    free(host_x);
    free(host_y);
    return 0;
}

8. CUDA implementation of the same algorithm (the minimal changes needed in the C code).

#include <stdio.h>
#include <stdlib.h>

/* Aborts with a readable message if a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if ( err_ != cudaSuccess ) {                                      \
            fprintf( stderr, "CUDA error %s:%d: %s\n", __FILE__,          \
                     __LINE__, cudaGetErrorString( err_ ) );              \
            exit( EXIT_FAILURE );                                         \
        }                                                                 \
    } while (0)

/*
 * GPU kernel: each thread computes one element, y[i] = cosf(x[i]).
 *
 * Launch layout: 1-D grid of 1-D blocks with at least n threads in total.
 * Threads whose global index is >= n do nothing, so the grid may be
 * rounded up to a whole number of blocks.
 *
 *   n  number of elements to process
 *   x  input array in device memory, at least n floats
 *   y  output array in device memory, at least n floats
 */
__global__ void vcos( int n, float* x, float* y )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if ( i < n )                 /* guard the tail of the last block */
        y[i] = cosf( x[i] );
}

int main()
{
    int n = 1024;
    float *host_x, *host_y;
    float *dev_x, *dev_y;

    host_x = (float*)malloc( n*sizeof(float) );
    host_y = (float*)malloc( n*sizeof(float) );
    if ( host_x == NULL || host_y == NULL ) {
        free(host_x);
        free(host_y);
        return 1;
    }

    CUDA_CHECK( cudaMalloc( &dev_x, n*sizeof(float) ) );
    CUDA_CHECK( cudaMalloc( &dev_y, n*sizeof(float) ) );

    /* fill host_x[i] with data here */
    for ( int i = 0; i < n; ++i )
        host_x[i] = (float)i;

    CUDA_CHECK( cudaMemcpy( dev_x, host_x, n*sizeof(float), cudaMemcpyHostToDevice ) );

    /* launch 1 thread per vector-element, 256 threads per block;
       round the block count up so a partial last block is still covered */
    int threads = 256;
    int bk = ( n + threads - 1 ) / threads;
    vcos<<<bk, threads>>>( n, dev_x, dev_y );
    CUDA_CHECK( cudaGetLastError() );   /* reports an invalid launch configuration */

    /* Blocking copy: also waits for the kernel to finish. */
    CUDA_CHECK( cudaMemcpy( host_y, dev_y, n*sizeof(float), cudaMemcpyDeviceToHost ) );
    /* host_y now contains cos(x) data */

    free(host_x);
    free(host_y);
    CUDA_CHECK( cudaFree(dev_x) );
    CUDA_CHECK( cudaFree(dev_y) );
    return 0;
}