// Jan Mandel September 2008 // Jan.Mandel@gmail.com // a simple main program for cuda testing // originally based on Fotran_Cuda from nvidia.com #include #include "cuda.h" #define real float // compute and store one number of the result __host__ __device__ void onerow(int i, int *x, int *r, real *v, real *a, real*b) #define X(i) x[i-1] #define V(i) v[i-1] #define A(i) a[i-1] #define B(i) b[i-1] #define R(i) r[i-1] { int k,j; real t; t=0.0; for (k=X(i); k< X(i+1); k++) { j=R(k); if (j>0) { t += V(k) * A(j); } } B(i)=t; } /* naive multiplication by transpose of matrix in column compressed format */ /* b=transpose(A)*a */ __global__ void spmtv_cu(int M, int N, int S, int *x, int *r, real *v, real *a, real *b) { unsigned int i = 1 + blockIdx.x*blockDim.x+threadIdx.x; if( i<=N ) { onerow( i, x, r, v, a, b); } } void cudasafe( cudaError_t err, char* str) { if (err != cudaSuccess) { printf("%s failed with error code %i\n",str,err); exit(1); } } extern "C" void cuda_device_init__( int* dev) { int deviceCount; cudasafe(cudaGetDeviceCount(&deviceCount),"cudaGetDeviceCount"); if (deviceCount == 0) { fprintf(stderr, "No devices supporting CUDA.\n"); exit(1); } int Dev = *dev; cudaDeviceProp deviceProp; cudasafe(cudaGetDeviceProperties(&deviceProp, Dev),"cudaGetDeviceProperties"); if (deviceProp.major < 1) { fprintf(stderr, "Device does not support CUDA.\n"); exit(1); } fprintf(stdout, " Device %d %s\n", Dev, deviceProp.name); cudasafe(cudaSetDevice(Dev),"cudaSetDevice"); } // Fortran subroutine arguments are passed by references. // reference implementation extern "C" void spmtv_gold__(int *m, int *n, int *s, int *x, int *r, real *v, real *a, real *b) { // b=transpose(A)*a, A is sparse matrix size m by n // entries of column j of A are stored in v(x(j)) to r(x(j+1)-1) // their row indices are stored in r(x(j)) to r(x(j+1)-1) // check: must have s>=x(n+1)-1 // all entries of r must be between 1 and M // integer x(n+1), r(s); real v(s), a(m), b(n) int N=*n; for (int i=1;i<=N; i++) { onerow( i, x, r, v, a, b); } } // Fortran subroutine arguments are passed by references. extern "C" void spmtv_cuda__(int *ifun, int *m, int *n, int *s, int *x, int *r, real *v, real *a, real *b) { // b=transpose(A)*a, A is sparse matrix size m by n // entries of column j of A are stored in v(x(j)) to r(x(j+1)-1) // their row indices are stored in r(x(j)) to r(x(j+1)-1) // check: must have s>=x(n+1)-1 // all entries of r must be between 1 and M // integer x(n+1), r(s); real v(s), a(m), b(n) int Ifun=*ifun; int N=*n; int M=*m; // int S=*s; static int *x_d, *r_d; // device arrays static real *v_d, *a_d, *b_d; int Sm; Sm=x[N]-1; int block_size=64; // Compute execution configuration dim3 dimBlock(block_size); dim3 dimGrid ((N+dimBlock.x-1)/dimBlock.x); switch(Ifun) { case 1: // Allocate arrays on device cudasafe(cudaMalloc ((void **) &x_d , sizeof(int)*(N+1)),"cudaMalloc\0"); cudasafe(cudaMalloc ((void **) &r_d , sizeof(int)*Sm),"cudaMalloc\0"); cudasafe(cudaMalloc ((void **) &v_d , sizeof(real)*Sm),"cudaMalloc\0"); cudasafe(cudaMalloc ((void **) &a_d , sizeof(real)*M),"cudaMalloc\0"); cudasafe(cudaMalloc ((void **) &b_d , sizeof(real)*N),"cudaMalloc\0"); // Copy matrix from host memory to device memory cudasafe(cudaMemcpy( x_d, x, sizeof(int)*(N+1) ,cudaMemcpyHostToDevice),"cudaMemcpy\0"); cudasafe(cudaMemcpy( r_d, r, sizeof(int)*Sm ,cudaMemcpyHostToDevice),"cudaMemcpy\0"); cudasafe(cudaMemcpy( v_d, v, sizeof(real)*Sm ,cudaMemcpyHostToDevice),"cudaMemcpy\0"); break; case 2: // copy vector from hodst to device cudasafe(cudaMemcpy( a_d, a, sizeof(real)*M ,cudaMemcpyHostToDevice),"cudaMemcpy\0"); break; case 3: // Execute the kernel spmtv_cu <<>>(M, N, Sm, x_d, r_d, v_d, a_d, b_d); break; case 4: // Copy the result back to host cudasafe(cudaMemcpy( b, b_d, sizeof(real)*N,cudaMemcpyDeviceToHost),"cudaMemcpy\0"); break; case 5: // Free memory on the device cudaFree(b_d); cudaFree(a_d); cudaFree(v_d); cudaFree(r_d); cudaFree(x_d); break; default: printf("spmtv_cuda: bad Ifun %i\n",Ifun); abort(); } return; }