1. Who can start CUDA programming? – Anyone who has some basic knowledge of C programming. No prior knowledge of graphics programming is required.
– CUDA is C with minimal extensions and some restrictions (CUDA is C for the GPU).
2. If you have an algorithm that you want to write in CUDA, follow the steps below. 3. Start by writing your algorithm in C/C++. 4. Change your implemented C/C++ program according to the CUDA requirements. 5. The basic flow of a CUDA implementation is: allocate CPU memory >> allocate the same amount of GPU memory >> put the input data in CPU memory >> copy the data into GPU memory >> do the processing in GPU memory >> copy the final data back to CPU memory. 6. A simple example that calculates the cosine of each integer from 0 to 1023 (1024 values) is explained below.
7. C implementation of the algorithm.
void vcos( int n, float* x, float* y )
{
for ( int i = 0; i < n; ++i)
y[i] = cos( x[i] );
}
/*
 * CPU-only reference program: fills host_x with 0, 1, ..., n-1, computes
 * host_y[i] = cos(host_x[i]) with vcos(), then releases both buffers.
 *
 * Returns 0 on success, 1 if either host buffer could not be allocated.
 */
int main()
{
    float *host_x, *host_y;
    int n = 1024;

    host_x = (float*)malloc( n*sizeof(float) );
    host_y = (float*)malloc( n*sizeof(float) );
    if ( host_x == NULL || host_y == NULL )
    {
        /* free(NULL) is a no-op, so both pointers can be released unconditionally */
        free(host_x);
        free(host_y);
        return( 1 );
    }

    /* the loop bound follows n so the fill always matches the allocation size */
    for( int i = 0; i < n; ++i )
        host_x[i] = (float)i;

    vcos( n, host_x, host_y );

    free(host_x);
    free(host_y);
    return( 0 );
}
8. CUDA implementation of the same algorithm (showing the minimal changes needed in the C code).
/*
 * Element-wise cosine on the GPU: y[i] = cos(x[i]) for 0 <= i < n.
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per element. The launch
 * must supply at least n threads in total; threads whose index is >= n do
 * nothing. No shared memory is used.
 *
 * n : number of elements to process
 * x : input array of at least n floats, dereferenced on the device
 * y : output array of at least n floats, written on the device
 */
__global__ void vcos( int n, float* x, float* y )
{
    /* flat global index of this thread across the whole grid */
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    /* guard the grid tail: when the launch provides more threads than there
       are elements, the surplus threads must not touch memory past the end */
    if ( i < n )
        y[i] = cosf( x[i] );  /* cosf: explicit single-precision cosine */
}
/*
 * GPU version of the cosine example. Follows the usual CUDA flow:
 * allocate host buffers -> allocate device buffers of the same size ->
 * fill the host input -> copy input to the device -> launch the kernel ->
 * copy the result back to the host -> release everything.
 *
 * Returns 0 on success, 1 if any allocation, copy, or the kernel launch fails.
 */
int main()
{
    /* every variable is declared before the first goto so that jumping to
       the cleanup label never skips an initialization */
    const int threads_per_block = 256;
    int n = 1024;
    size_t bytes = n * sizeof(float);
    float *host_x = NULL, *host_y = NULL;
    float *dev_x = NULL, *dev_y = NULL;
    int bk = 0;
    int status = 1;  /* stays 1 (failure) until the whole pipeline has succeeded */

    host_x = (float*)malloc( bytes );
    host_y = (float*)malloc( bytes );
    if ( host_x == NULL || host_y == NULL )
        goto cleanup;

    if ( cudaMalloc( &dev_x, bytes ) != cudaSuccess )
        goto cleanup;
    if ( cudaMalloc( &dev_y, bytes ) != cudaSuccess )
        goto cleanup;

    /* fill host_x[i] with data here */
    for( int i = 0; i < n; ++i )
        host_x[i] = (float) i;

    if ( cudaMemcpy( dev_x, host_x, bytes, cudaMemcpyHostToDevice ) != cudaSuccess )
        goto cleanup;

    /* launch 1 thread per vector-element, 256 threads per block.
       The block count is rounded up so that a size which is not a multiple
       of 256 would still get a thread for every element; the kernel must
       then ignore indices >= n. */
    bk = ( n + threads_per_block - 1 ) / threads_per_block;
    vcos<<< bk, threads_per_block >>>( n, dev_x, dev_y );

    /* a kernel launch returns nothing; configuration errors are reported here */
    if ( cudaGetLastError() != cudaSuccess )
        goto cleanup;

    /* this blocking copy waits for the kernel to finish and also reports
       any error raised while the kernel was running */
    if ( cudaMemcpy( host_y, dev_y, bytes, cudaMemcpyDeviceToHost ) != cudaSuccess )
        goto cleanup;

    /* host_y now contains cos(x) data */
    status = 0;

cleanup:
    /* free(NULL) and cudaFree(NULL) are both no-ops, so every pointer can be
       released here regardless of how far the program got */
    free(host_x);
    free(host_y);
    cudaFree(dev_x);
    cudaFree(dev_y);
    return( status );
}
