Compiling the code:
> nvcc -O3 -arch sm_30 --ptxas-options=-v -lineinfo skalar.cu -o main.NVCC_
> ./main.NVCC_
> optirun ./main.NVCC_
(optirun is obsolete with most recent kernels; Jan. 2015)

How to CUDA-parallelize the inner product:
Original code for the inner product:

double scalar(const int N, const double x[], const double y[])
{
    double sum = 0.0;
    for (int i = 0; i < N; ++i) {
        sum += x[i] * y[i];
    }
    return sum;
}

int main()
{
    ...
    double s = scalar(n, a, b);
    ...
}
// skalar.cu
#include <iostream>
#include <cmath>               // ceil, log used in dscapr_GPU
using namespace std;

// kernel function on device
__global__ void scalar(double *sg, const double *x, const double *y, int N)
{
    extern __shared__ double sdata[];
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = gridDim.x * blockDim.x;

    double s = 0.0;
    for (int i = idx; i < N; i += str)     // grid-stride loop over all elements
        s += x[i] * y[i];
    sdata[threadIdx.x] = s;
    __syncthreads();

    // binary reduction of the partial sums in shared memory
    for (int j = blockDim.x >> 1; j > 0; j >>= 1) {
        if (threadIdx.x < j) {
            sdata[threadIdx.x] += sdata[threadIdx.x + j];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0)
        sg[blockIdx.x] = sdata[0];         // one partial sum per block
}

// kernel function for addition on one block on device
__global__ void add_1_block(double *s, int N)
{
    if (N > blockDim.x)
        return;
    extern __shared__ double sdata[];
    const int tid = threadIdx.x;
    if (tid < N)
        sdata[tid] = s[tid];
    else
        sdata[tid] = 0.0;
    __syncthreads();

    for (int j = blockDim.x / 2; j > 0; j >>= 1) {
        if (tid < j) {
            sdata[tid] += sdata[tid + j];
        }
        __syncthreads();
    }
    if (tid == 0)
        s[0] = sdata[0];                   // final result in s[0]
}
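On sm_30 and newer GPUs the per-block reduction can also be written with warp shuffles, which avoids most of the shared-memory traffic. The following kernel is a sketch of that variant, not part of the lecture code: the name scalar_shfl is our own, and it assumes CUDA 9.0 or newer (for __shfl_down_sync) and a block size that is a multiple of 32.

// warp-shuffle variant of the per-block reduction (sketch, own addition)
__global__ void scalar_shfl(double *sg, const double *x, const double *y, int N)
{
    __shared__ double warpSum[32];         // one partial sum per warp (at most 32 warps/block)
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = gridDim.x * blockDim.x;

    double s = 0.0;
    for (int i = idx; i < N; i += str)     // grid-stride loop as in the original kernel
        s += x[i] * y[i];

    // reduce within each warp
    for (int offset = 16; offset > 0; offset >>= 1)
        s += __shfl_down_sync(0xffffffff, s, offset);

    const int lane = threadIdx.x & 31;     // lane index within the warp
    const int wid  = threadIdx.x >> 5;     // warp index within the block
    if (lane == 0) warpSum[wid] = s;
    __syncthreads();

    // the first warp reduces the per-warp sums
    if (wid == 0) {
        s = (lane < (blockDim.x >> 5)) ? warpSum[lane] : 0.0;
        for (int offset = 16; offset > 0; offset >>= 1)
            s += __shfl_down_sync(0xffffffff, s, offset);
        if (lane == 0) sg[blockIdx.x] = s;
    }
}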
//-------------------------------------------------------------
// Host function. Inner product calculated with device vectors
double dscapr_GPU(const int N, const double x_d[], const double y_d[])
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    const int blocksize = 4 * 64,
              gridsize  = prop.multiProcessorCount,
              sBytes    = blocksize * sizeof(double);
    dim3 dimBlock(blocksize);
    dim3 dimGrid(gridsize);

    double sg, *s_d;                       // device vector storing the partial sums
    cudaMalloc((void **) &s_d, blocksize * sizeof(double));   // temp. memory on device

    // call the kernel function with dimGrid.x * dimBlock.x threads
    scalar<<<dimGrid, dimBlock, sBytes>>>(s_d, x_d, y_d, N);

    // power of 2 as number of threads per block
    const unsigned int oneBlock = 2 << (int)ceil(log(dimGrid.x + 0.0) / log(2.0));
    add_1_block<<<1, oneBlock, oneBlock * sizeof(double)>>>(s_d, dimGrid.x);

    // copy data: device --> host
    cudaMemcpy(&sg, s_d, sizeof(double), cudaMemcpyDeviceToHost);
    cudaFree(s_d);
    return sg;
}
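For comparison only, the same inner product on device vectors can be obtained with Thrust, which ships with the CUDA toolkit and performs the reduction internally. This sketch is not part of the lecture code; the wrapper name dscapr_GPU_thrust is our own.

#include <thrust/inner_product.h>
#include <thrust/device_ptr.h>

// same interface as dscapr_GPU, but the kernels and the reduction are handled by Thrust
double dscapr_GPU_thrust(const int N, const double x_d[], const double y_d[])
{
    thrust::device_ptr<const double> xp(x_d), yp(y_d);   // wrap the raw device pointers
    return thrust::inner_product(xp, xp + N, yp, 0.0);   // sum of x[i]*y[i] on the device
}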
//-------------------------------------------------------------
// main function on host
int main()
{
    double *x_h, *y_h;                     // host data
    double *x_d, *y_d;                     // device data
    double sum, tstart, tgpu;              // result and timing variables
    const int N = 14000000, nBytes = N * sizeof(double);
    const int LOOP = 100;

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    const int blocksize = 4 * 64, gridsize = prop.multiProcessorCount;   // as in dscapr_GPU
    cout << endl << gridsize << " x " << blocksize << " Threads\n";

    x_h = new double [N];
    y_h = new double [N];
    // allocate memory on device
    cudaMalloc((void **) &x_d, nBytes);
    cudaMalloc((void **) &y_d, nBytes);

    for (int i = 0; i < N; i++) {
        x_h[i] = (i % 137) + 1;
        y_h[i] = 1.0 / x_h[i];
    }
    // copy data: host --> device
    cudaMemcpy(x_d, x_h, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(y_d, y_h, nBytes, cudaMemcpyHostToDevice);

    for (int k = 0; k < LOOP; ++k)
    {
        sum = dscapr_GPU(N, x_d, y_d);
    }

    delete [] x_h;
    delete [] y_h;
    cudaFree(x_d);
    cudaFree(y_d);
    return 0;
}
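The variables tstart and tgpu in main() are declared but not used above; a minimal sketch (our own addition, not part of the lecture code) of how the measurement loop could be timed with CUDA events is shown below. It is meant to replace the LOOP loop inside main() and uses the variables declared there.

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    for (int k = 0; k < LOOP; ++k) {
        sum = dscapr_GPU(N, x_d, y_d);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);            // wait until all GPU work has finished

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    tgpu = ms / LOOP;                      // average time per inner product in ms
    cout << "<x,y> = " << sum << ",  " << tgpu << " ms per call" << endl;

    cudaEventDestroy(start);
    cudaEventDestroy(stop);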
> nvcc -O3 -arch sm_30 --ptxas-options=-v -lineinfo skalar.cu -o main.NVCC_
> ./main.NVCC_
Compilation problem with CUDA 6.5 and gcc-4.9 (and higher) in Ubuntu 14.10 [Feb. 2015]: