Hi everyone,
I’m new to CUDA development, currently facing a bad error message while working on a simple summation matrix kernel. My development environment follow: Ubuntu 15.04, CUDA 7.0 and GTX 960.
Please find here some code:
#include "common.h"
#include <cuda_runtime.h>
#include <stdio.h>
/*
* This example implements matrix element-wise addition on the host and GPU.
* sumMatrixOnHost iterates over the rows and columns of each matrix, adding
* elements from A and B together and storing the results in C. The current
* offset in each matrix is stored using pointer arithmetic. sumMatrixOnGPU2D
* implements the same logic, but using CUDA threads to process each matrix.
*/
void initialData(float *ip, const int size)
{
int i;
for(i = 0; i < size; i++)
{
ip[i] = (float)( rand() & 0xFF ) / 10.0f;
}
}
void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
{
float *ia = A;
float *ib = B;
float *ic = C;
for (int iy = 0; iy < ny; iy++)
{
for (int ix = 0; ix < nx; ix++)
{
ic[ix] = ia[ix] + ib[ix];
}
ia += nx;
ib += nx;
ic += nx;
}
return;
}
void checkResult(float *hostRef, float *gpuRef, const int N)
{
double epsilon = 1.0E-8;
for (int i = 0; i < N; i++)
{
if (abs(hostRef[i] - gpuRef[i]) > epsilon)
{
printf("host %f gpu %f ", hostRef[i], gpuRef[i]);
printf("Arrays do not match.\n\n");
break;
}
}
}
// grid 2D block 2D
__global__ void sumMatrixOnGPU2D(float *A, float *B, float *C, int NX, int NY)
{
unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int idx = iy * NX + ix;
if (ix < NX && iy < NY)
{
C[idx] = A[idx] + B[idx];
}
}
int main(int argc, char **argv)
{
// set up device
int dev = 0;
cudaDeviceProp deviceProp;
CHECK(cudaGetDeviceProperties(&deviceProp, dev));
CHECK(cudaSetDevice(dev));
// set up data size of matrix
int nx = 1 << 14;
int ny = 1 << 14;
int nxy = nx * ny;
int nBytes = nxy * sizeof(float);
// malloc host memory
float *h_A, *h_B, *hostRef, *gpuRef;
h_A = (float *)malloc(nBytes);
h_B = (float *)malloc(nBytes);
hostRef = (float *)malloc(nBytes);
gpuRef = (float *)malloc(nBytes);
// initialize data at host side
size_t iStart = seconds();
initialData(h_A, nxy);
initialData(h_B, nxy);
size_t iElaps = seconds() - iStart;
memset(hostRef, 0, nBytes);
memset(gpuRef, 0, nBytes);
// add matrix at host side for result checks
iStart = seconds();
sumMatrixOnHost (h_A, h_B, hostRef, nx, ny);
iElaps = seconds() - iStart;
// malloc device global memory
float *d_MatA, *d_MatB, *d_MatC;
CHECK(cudaMalloc((void **)&d_MatA, nBytes));
CHECK(cudaMalloc((void **)&d_MatB, nBytes));
CHECK(cudaMalloc((void **)&d_MatC, nBytes));
// transfer data from host to device
CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));
// invoke kernel at host side
int dimx = 32;
int dimy = 32;
if(argc > 2)
{
dimx = atoi(argv[1]);
dimy = atoi(argv[2]);
}
dim3 block(dimx, dimy);
dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
// execute the kernel
CHECK(cudaDeviceSynchronize());
iStart = seconds();
sumMatrixOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
CHECK(cudaDeviceSynchronize());
iElaps = seconds() - iStart;
printf("sumMatrixOnGPU2D <<<(%d,%d), (%d,%d)>>> elapsed %d ms\n", grid.x,
grid.y,
block.x, block.y, iElaps);
CHECK(cudaGetLastError());
// copy kernel result back to host side
CHECK(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost));
// check device results
checkResult(hostRef, gpuRef, nxy);
// free device global memory
CHECK(cudaFree(d_MatA));
CHECK(cudaFree(d_MatB));
CHECK(cudaFree(d_MatC));
// free host memory
free(h_A);
free(h_B);
free(hostRef);
free(gpuRef);
// reset device
CHECK(cudaDeviceReset());
return EXIT_SUCCESS;
}
and
#include <sys/time.h>
#ifndef _COMMON_H
#define _COMMON_H
#define CHECK(call) \
{ \
const cudaError_t error = call; \
if (error != cudaSuccess) \
{ \
fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__); \
fprintf(stderr, "code: %d, reason: %s\n", error, \
cudaGetErrorString(error)); \
} \
}
#define CHECK_CUBLAS(call) \
{ \
cublasStatus_t err; \
if ((err = (call)) != CUBLAS_STATUS_SUCCESS) \
{ \
fprintf(stderr, "Got CUBLAS error %d at %s:%d\n", err, __FILE__, \
__LINE__); \
exit(1); \
} \
}
#define CHECK_CURAND(call) \
{ \
curandStatus_t err; \
if ((err = (call)) != CURAND_STATUS_SUCCESS) \
{ \
fprintf(stderr, "Got CURAND error %d at %s:%d\n", err, __FILE__, \
__LINE__); \
exit(1); \
} \
}
#define CHECK_CUFFT(call) \
{ \
cufftResult err; \
if ( (err = (call)) != CUFFT_SUCCESS) \
{ \
fprintf(stderr, "Got CUFFT error %d at %s:%d\n", err, __FILE__, \
__LINE__); \
exit(1); \
} \
}
#define CHECK_CUSPARSE(call) \
{ \
cusparseStatus_t err; \
if ((err = (call)) != CUSPARSE_STATUS_SUCCESS) \
{ \
fprintf(stderr, "Got error %d at %s:%d\n", err, __FILE__, __LINE__); \
cudaError_t cuda_err = cudaGetLastError(); \
if (cuda_err != cudaSuccess) \
{ \
fprintf(stderr, " CUDA error \"%s\" also detected\n", \
cudaGetErrorString(cuda_err)); \
} \
exit(1); \
} \
}
inline double seconds()
{
struct timeval tp;
struct timezone tzp;
int i = gettimeofday(&tp, &tzp);
return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}
#endif // _COMMON_H
Everything compiled as here:
nvcc -O3 -arch=sm_20 sumMatrix.cu -o sumMatrix
This is what I get trying to execute it:
./sumMatrix 32 32
Error: sumMatrix.cu:111, code: 2, reason: out of memory
Error: sumMatrix.cu:112, code: 2, reason: out of memory
Error: sumMatrix.cu:116, code: 11, reason: invalid argument
Error: sumMatrix.cu:135, code: 77, reason: an illegal memory access was encountered
sumMatrixOnGPU2D <<<(512,512), (32,32)>>> elapsed 0.00 ms
Error: sumMatrix.cu:140, code: 77, reason: an illegal memory access was encountered
Error: sumMatrix.cu:143, code: 77, reason: an illegal memory access was encountered
host 22.500000 gpu 0.000000 Arrays do not match.
To be honest, I just can’t figure out what’s happening !
Any suggestions ?
Thank you in advance.