// Jan Mandel September 2008 // Jan.Mandel@gmail.com // a simple program for cuda testing // originally based on Fotran_Cuda from nvidia.com #include #include "cuda.h" #include "cuda_util.h" #define real float #define X(i,j) x[i-1+ldim*(j-1)] #define R(i,j) r[i-1+ldim*(j-1)] #define Y(i,j) y[i-1+ldim*(j-1)] // compute and store one number of the result // generates two codes from the same source __host__ __device__ void onerow(real *out, int i, int j, int ldim, real *x, real rhs, real omega) // compute one number in the 5-point laplace formula // out out the result // i,j in index of the entry to compute (from 1) // x in approx solution array dimension m n, stored columnwise // r in right-hand side array dimension m n, stored columnwise { real res; res=rhs-(X(i,j)-0.25*(X(i-1,j)+X(i+1,j)+X(i,j-1)+X(i,j+1))); *out=X(i,j)+omega*res; // fprintf(stdout, "onerow: %d %d %g %g %g %g %g\n",i,j,*out,rhs,X(i,j),res,omega); // fprintf(stdout," %g %g %g\n %g %g %g\n %g %g %g\n",X(i-1,j-1),X(i-1,j),X(i-1,j+1), \ // X(i,j-1),X(i,j),X(i,j+1),X(i+1,j-1),X(i+1,j),X(i+1,j+1)); // emulation only } // reference C implementation using the onerow // Fortran callable subroutine arguments are passed by reference extern "C" void laplace_c__(int *M, int *N, int *LDIM, real *x, real *r, real *y, real *OMEGA, int *NITER) { int m=*M; int n=*N; int ldim = *LDIM; real omega = *OMEGA; int niter = *NITER; for (int it=1; it<=niter; it++) { for (int j=2; j<=n-1; j++) { for (int i=2; i<=m-1; i++) { onerow(&Y(i,j),i, j, ldim, x, R(i,j), omega); } } for (int j=2; j<=n-1; j++) { for (int i=2; i<=m-1; i++) { onerow(&X(i,j),i, j, ldim, y, R(i,j), omega); } } } } // laplace_c // ******************* // simple version // ******************* // CUDA kernel - executes in parallel __global__ void laplace_cu(int m, int n, int ldim, real *y, real *x, real *r, real omega) { unsigned int i = 1 + blockIdx.x*blockDim.x+threadIdx.x; // where am I, from 0 unsigned int j = 1 + blockIdx.y*blockDim.y+threadIdx.y; if( i>1 && i1 && jx_d\0"); fprintf(stdout,"laplace_cuda: copy r\n"); cudasafe(cudaMemcpy( r_d, r, sizem ,cudaMemcpyHostToDevice),"cudaMemcpy r->r_d\0"); fprintf(stdout,"laplace_cuda: copy y\n"); cudasafe(cudaMemcpy( y_d, y, sizem ,cudaMemcpyHostToDevice),"cudaMemcpy y->y_d\0"); fprintf(stdout,"laplace_cuda: copy done\n"); break; case 2: // Execute the cuda kernel fprintf(stdout,"block size %d %d \n",dimBlock.x,dimBlock.y); fprintf(stdout,"grid size %d %d \n",dimGrid.x,dimGrid.y); for (int it=1; it<=niter; it++) { laplace_cu <<>>(m, n, ldim, y_d, x_d, r_d, omega); checkCUDAError("kernel launch"); laplace_cu <<>>(m, n, ldim, x_d, y_d, r_d, omega); checkCUDAError("kernel launch"); } // note the work array y_d was copied to the device because of boundary conditions // but it does not need to be copied back to the host break; case 3: // Copy the result back to host cudasafe(cudaMemcpy( x, x_d, sizem, cudaMemcpyDeviceToHost),"cudaMemcpy\0"); // Free memory on the device cudaFree(x_d); cudaFree(y_d); cudaFree(r_d); break; default: // always check - never fail silently printf("laplace_cuda: bad ifun %i\n",ifun); abort(); } return; } // add laplace_xxxxx_cuda functions and their kernels here // Fortran - subroutine arguments are passed by references. extern "C" void laplace_cuda__(int *IFUN, int *M, int *N, int *LDIM, \ real *x, real *r, real *y, real *OMEGA, int *NITER, int *METHOD, int *OVER) { int ifun = *IFUN; int m = *M; int n = *N; int ldim = *LDIM; real omega = *OMEGA; int niter = *NITER; int method = *METHOD; *OVER = 0; switch(method) { case 1: // the straightforward method laplace_cuda(ifun, m, n, ldim, x, r, y, omega, niter); break; // add other cases as desired // suggest copy the kernel and cuda routine under different names and modify // case 2: // // blocks in shared memory // laplace_shared_cuda(ifun, m, n, ldim, x, r, y, omega, niter); // break; default: // always check - never fail silently *OVER = 1; } }