Hello!
I’m working on a molecular dynamics project and want to speed up the calculation of accelerations.
When I run the kernel with 1177 blocks and 16 threads per block, I get the exception below.
If I reduce the number of blocks or threads per block, the program runs correctly.
Exception thrown at 0x00D79C23 (nvcuda.dll) in Proj_gpu_Accel.exe: 0xC0000005: Access violation reading location 0x00A21000.
Device:
Quadro 3000M
Here's the relevant piece of code:
Kernel:
__global__ void Accel_gpu(double *d_x, double *d_y, double *d_ax, double *d_ay,
int N, double Lx, double Ly) {
//int idx = threadIdx.x;
int idx = threadIdx.x + blockIdx.x * blockDim.x;
double dx, dy, temp;
int sgn_dx, sgn_dy;
double *r = new double[N];
double *g = new double[N];
//----------------------------
//some calculations
Function that I used to call kernel:
// Host wrapper: copies the four arrays to the device, launches Accel_gpu, and
// copies ax/ay back into the caller's buffers.
//
// Parameters:
//   x, y    host input arrays passed to the kernel as d_x, d_y
//   ax, ay  host arrays copied to the device before the launch and overwritten
//           with the device contents afterwards
//   N       element count forwarded to the kernel; also used here as the
//           length of every array
//   Lx, Ly  forwarded to the kernel unchanged
//
// NOTE(review): this assumes each host array holds exactly N doubles. The
// previous byte count was 6 * N * sizeof(double); with N-element host arrays
// that makes cudaMemcpy read/write past the end of the host buffers, which is
// consistent with a host-side access violation inside nvcuda.dll. If the arrays
// really are 6 * N long, restore the factor — confirm against the caller.
//
// On any CUDA failure the remaining steps are skipped, the error string is
// printed, and all device buffers are still released.
void Accel_use_gpu(double *x, double *y, double *ax, double *ay, int N, double Lx, double Ly) {
    // Nothing to transfer or compute for an empty system.
    if (N <= 0) {
        return;
    }

    // size_t arithmetic: the byte count no longer overflows an int for large N.
    const size_t bytes = static_cast<size_t>(N) * sizeof(double);

    // Zero-initialised so the cleanup below is safe even if an allocation fails
    // (cudaFree on a null pointer performs no operation).
    double *d_x = 0, *d_y = 0, *d_ax = 0, *d_ay = 0;

    cudaError_t err = cudaMalloc((void**)&d_x, bytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&d_y, bytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&d_ax, bytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&d_ay, bytes);

    if (err == cudaSuccess) err = cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(d_ax, ax, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(d_ay, ay, bytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        // NOTE(review): the launch shape is fixed at 1177 * 16 = 18832 threads
        // regardless of N; the kernel must ignore indices >= N — confirm.
        // NOTE(review): Accel_gpu allocates per-thread scratch with device-side
        // `new`, which draws from the device malloc heap. If that heap is too
        // small for this many threads the allocations fail inside the kernel;
        // the limit can be raised once, before the first launch, with
        // cudaDeviceSetLimit(cudaLimitMallocHeapSize, ...).
        Accel_gpu<<<1177, 16>>>(d_x, d_y, d_ax, d_ay, N, Lx, Ly);
        // A kernel launch returns no status; launch-configuration errors are
        // only visible through cudaGetLastError().
        err = cudaGetLastError();
    }

    // These blocking copies wait for the kernel, so a fault raised during
    // kernel execution is reported here.
    if (err == cudaSuccess) err = cudaMemcpy(ax, d_ax, bytes, cudaMemcpyDeviceToHost);
    if (err == cudaSuccess) err = cudaMemcpy(ay, d_ay, bytes, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess) {
        printf("Accel_use_gpu: CUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_ax);
    cudaFree(d_ay);
}
Please help me: how can I increase the amount of memory available to the kernel? My Quadro 3000M reports
max threads per block = 1024 and max grid size = 65535.