Error increasing problem size using CUDA

I’m using Geforce Fermi 610M, 1 SM run on Windows 10 64bit,
Here is my runnable code:

#include <stdio.h>
#include <conio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <cuda.h>
#include <device_launch_parameters.h>
#include <cuda_runtime.h>
#include <cuda_device_runtime_api.h>
#include <cuda_profiler_api.h>
#include <device_functions.h>

int N = 1572864;
int gridsize=8; 
int blocksize=192; 

int *a, *aux;

int mergesort(int *a, int N);
__global__ void mergeBU(int *d_a, int *d_aux, int N, int Nthreads, int sz);
__device__ void merge(int *d_a, int *d_aux, int lo, int mid, int hi);

__global__ void mergeBU(int *d_a, int *d_aux, int N, int Nthreads, int sz)
{
	int lo;
	int idx = blockIdx.x*blockDim.x + threadIdx.x;
	float slices = N / (2 * Nthreads*sz);

	if (slices > 1.0 || slices == 1.0)
	{
		lo = (N / Nthreads)*idx;
		for (float slice = 0; slice < slices; slice++)
		{
			if (lo >= N) { break; }
			int mid = lo + sz - 1;
			int hi = __min(lo + sz + sz - 1, N - 1);
			merge(d_a, d_aux, lo, mid, hi);
			lo = lo + sz + sz;
		}
	}

	else if (slices < 1.0)
	{
		lo = 2 * sz * idx;
		int mid = lo + sz - 1;
		int hi = __min(lo + sz + sz - 1, N - 1);
		merge(d_a, d_aux, lo, mid, hi);
	}	
}

__device__ void merge(int *d_a, int *d_aux, int lo, int mid, int hi)
{
	int i = lo;
	int j = mid + 1;

	for (int k = lo; k <= hi; k++)
	{
		d_aux[k] = d_a[k];
	}

	for (int k = lo; k <= hi; k++)
	{
		if	(i > mid)	        { d_a[k] = d_aux[j]; j++; }
		else if (j > hi)		{ d_a[k] = d_aux[i]; i++; }
		else if (d_aux[j] < d_aux[i])	{ d_a[k] = d_aux[j]; j++; }
		else				{ d_a[k] = d_aux[i]; i++; }
	}
}

int randInRange(int min1, int max1)
{
	double scale = 1.0 / (RAND_MAX + 1);
	double range = max1 - min1 + 1;
	return min1 + (int)(rand() * scale * range);
}

int mergesort(int *a, int N)
{	
	
	int Nthreads = gridsize*blocksize;

	cudaError_t cudaStatus;
	int *d_a, *d_aux;
	
	cudaStatus = (cudaEventCreate(&startEvent));
	if (cudaStatus != cudaSuccess)
	{
		fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(cudaStatus));
		exit(EXIT_FAILURE);
	}

	cudaStatus = (cudaEventCreate(&stopEvent));
	if (cudaStatus != cudaSuccess)
	{
		fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
		exit(EXIT_FAILURE);
	}

	cudaStatus = cudaSetDevice(0);
	if (cudaStatus != cudaSuccess)
	{
		printf("cudaSetDevice returned error %s (code %d), line(%d)\n", cudaGetErrorString(cudaStatus), cudaStatus, __LINE__);
		exit(EXIT_FAILURE);
	}

	cudaStatus = cudaMalloc((void**)&d_a, N * sizeof(int));
	if (cudaStatus != cudaSuccess)
	{
		printf("cudaMalloc d_a returned error %s (code %d), line(%d)\n", cudaGetErrorString(cudaStatus), cudaStatus, __LINE__);
		exit(EXIT_FAILURE);
	}

	cudaStatus = cudaMalloc((void**)&d_aux, N * sizeof(int));
	if (cudaStatus != cudaSuccess)
	{
		printf("cudaMalloc d_aux returned error %s (code %d), line(%d)\n", cudaGetErrorString(cudaStatus), cudaStatus, __LINE__);
		//exit(EXIT_FAILURE);
	}

	cudaStatus = cudaMemcpy(d_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess)
	{
		printf("cudaMemcpy d_a returned error %s (code %d), line(%d)\n", cudaGetErrorString(cudaStatus), cudaStatus, __LINE__);
		exit(EXIT_FAILURE);
	}

	for (int sz = 1; sz < N; sz = sz + sz)
	{
		mergeBU << <gridsize, blocksize >> > (d_a, d_aux, N, Nthreads,sz);
	}
	
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
		exit(EXIT_FAILURE);
	}

	cudaStatus = cudaMemcpy(a, d_a, N * sizeof(int), cudaMemcpyDeviceToHost);
	if (cudaStatus != cudaSuccess)
	{
		printf("cudaMemcpy (a,d_a) returned error %s (code %d), line(%d)\n", cudaGetErrorString(cudaStatus), cudaStatus, __LINE__);
		exit(EXIT_FAILURE);
	}

	//cleanup
	cudaFree(d_a);
	cudaFree(d_aux);
	cudaEventDestroy(startEvent);
	cudaEventDestroy(stopEvent);
	fclose(Culog);
	return cudaStatus;
}

int main(void)
{

	a	= (int *)malloc(N * sizeof(int));
	aux     = (int *)malloc(N * sizeof(int));

        // random generator
	for (int i = 0; i < N; i++)
	{
	a[i] = randInRange(1,100);
	}

        mergesort(a, N);

	//cleanup==============================================================
	free(a);
	free(aux);
	system("PAUSE");
	return 0;
}

I tested on various number of threads and various number of data :

<<<8,12>>>  96   threads   --> max N = 48mb = 786432 elements  //because of int type (@4bytes)
<<<8,24>>>  192  threads   --> max N = 24mb = 1572864 elements
<<<8,48>>>  384  threads   --> max N = 12mb = 3145728 elements
<<<8,96>>>  768  threads   --> max N = 6mb  = 6291456 elements
<<<8,192>>> 1536 threads   --> max N = 3mb  = 12582912 elements

Question = Why can I only invoke 48mb max at some number threads? I have 2048mb on gpu memory, how is this corresponding to that number?

If I run <<<8,192>>> on 6mb data set, it fails, it gives me error illegal memory access(code77).

Here is my ptxas info when i run on 8,192 with 6mb data set:

1>         ptxas info    : 0 bytes gmem
1>         ptxas info    : Function properties for _Z5mergePiS_iii
1>             8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads
1>         ptxas info    : Compiling entry function '_Z7mergeBUPiS_iii' for 'sm_20'
1>         ptxas info    : Function properties for _Z7mergeBUPiS_iii
1>             16 bytes stack frame, 16 bytes spill stores, 32 bytes spill loads
1>         ptxas info    : Used 20 registers, 60 bytes cmem[0]
1>         ptxas info    : 0 bytes gmem
1>         ptxas info    : Function properties for _Z5mergePiS_iii
1>             8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads
1>         ptxas info    : Compiling entry function '_Z7mergeBUPiS_iii' for 'sm_30'
1>         ptxas info    : Function properties for _Z7mergeBUPiS_iii
1>             16 bytes stack frame, 16 bytes spill stores, 32 bytes spill loads
1>         ptxas info    : Used 20 registers, 348 bytes cmem[0]
1>         ptxas info    : 0 bytes gmem
1>         ptxas info    : Function properties for cudaDeviceGetAttribute
1>             16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Function properties for _Z5mergePiS_iii
1>             0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Function properties for cudaMalloc
1>             16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessor
1>             32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Function properties for cudaGetDevice
1>             8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Compiling entry function '_Z7mergeBUPiS_iii' for 'sm_35'
1>         ptxas info    : Function properties for _Z7mergeBUPiS_iii
1>             16 bytes stack frame, 16 bytes spill stores, 32 bytes spill loads
1>         ptxas info    : Used 20 registers, 348 bytes cmem[0]
1>         ptxas info    : Function properties for cudaFuncGetAttributes
1>             16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
1>             40 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : 0 bytes gmem
1>         ptxas info    : Function properties for cudaDeviceGetAttribute
1>             16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Function properties for _Z5mergePiS_iii
1>             0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Function properties for cudaMalloc
1>             16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessor
1>             32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Function properties for cudaGetDevice
1>             8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Compiling entry function '_Z7mergeBUPiS_iii' for 'sm_37'
1>         ptxas info    : Function properties for _Z7mergeBUPiS_iii
1>             16 bytes stack frame, 16 bytes spill stores, 32 bytes spill loads
1>         ptxas info    : Used 20 registers, 348 bytes cmem[0]
1>         ptxas info    : Function properties for cudaFuncGetAttributes
1>             16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
1>             40 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : 0 bytes gmem
1>         ptxas info    : Function properties for cudaDeviceGetAttribute
1>             16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Function properties for _Z5mergePiS_iii
1>             8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads
1>         ptxas info    : Function properties for cudaMalloc
1>             16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessor
1>             32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Function properties for cudaGetDevice
1>             8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Compiling entry function '_Z7mergeBUPiS_iii' for 'sm_50'
1>         ptxas info    : Function properties for _Z7mergeBUPiS_iii
1>             32 bytes stack frame, 32 bytes spill stores, 108 bytes spill loads
1>         ptxas info    : Used 20 registers, 348 bytes cmem[0]
1>         ptxas info    : Function properties for cudaFuncGetAttributes
1>             16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
1>             40 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : 0 bytes gmem
1>         ptxas info    : Function properties for cudaDeviceGetAttribute
1>             16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Function properties for _Z5mergePiS_iii
1>             8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads
1>         ptxas info    : Function properties for cudaMalloc
1>             16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessor
1>             32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Function properties for cudaGetDevice
1>             8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Compiling entry function '_Z7mergeBUPiS_iii' for 'sm_52'
1>         ptxas info    : Function properties for _Z7mergeBUPiS_iii
1>             32 bytes stack frame, 32 bytes spill stores, 108 bytes spill loads
1>         ptxas info    : Used 20 registers, 348 bytes cmem[0]
1>         ptxas info    : Function properties for cudaFuncGetAttributes
1>             16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
1>             40 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : 0 bytes gmem
1>         ptxas info    : Function properties for cudaDeviceGetAttribute
1>             16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Function properties for _Z5mergePiS_iii
1>             8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads
1>         ptxas info    : Function properties for cudaMalloc
1>             16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessor
1>             32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Function properties for cudaGetDevice
1>             8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Compiling entry function '_Z7mergeBUPiS_iii' for 'sm_60'
1>         ptxas info    : Function properties for _Z7mergeBUPiS_iii
1>             32 bytes stack frame, 32 bytes spill stores, 108 bytes spill loads
1>         ptxas info    : Used 20 registers, 348 bytes cmem[0]
1>         ptxas info    : Function properties for cudaFuncGetAttributes
1>             16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>         ptxas info    : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
1>             40 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads

Question 1 : is it beacause i run out gpu memory? well I guess not, with that amount of stack frame, it provides me a lot more memory that I can use.
based on :
https://devtalk.nvidia.com/default/topic/642743/what-is-the-maximum-cuda-stack-frame-size-per-kerenl-/
and
http://stackoverflow.com/questions/34794481/cuda-stack-and-heap

Quetion 1.b : why is that 0 byte gmem? I’m sure that I already use cudaMalloc and cudaMemcpy to copy it onto device memory.

Then I run cuda-memcheck :

Invalid __global__ read of size 4
=========     at 0x000001b8 in D:/College/Semester 9/Tugas Akhir/Skripsi/Implementasi/Koding/Mergesort/MergesortCUDA/mergesortCUDA.cu:67:merge(int*, int*, int, int, int)
=========     by thread (111,0,0) in block (7,0,0)
=========     Address 0x2d7900000 is out of bounds
=========     Device Frame:D:/College/Semester 9/Tugas Akhir/Skripsi/Implementasi/Koding/Mergesort/MergesortCUDA/mergesortCUDA.cu:57:mergeBU(int*, int*, int, int, int) (mergeBU(int*, int*, int, int, int) : 0x768)
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:C:\Windows\system32\nvcuda.dll (cuModuleLoadDataEx + 0x193672) [0x19aef5]
=========     Host Frame:D:\College\Semester 9\Tugas Akhir\Skripsi\Implementasi\Koding\Mergesort\x64\Debug\cudart64_80.dll (_cudaInitManagedRuntime + 0x3bb2) [0x5292]
=========     Host Frame:D:\College\Semester 9\Tugas Akhir\Skripsi\Implementasi\Koding\Mergesort\x64\Debug\cudart64_80.dll (cudaLaunch + 0x105) [0x2e8e5]
=========     Host Frame:D:\College\Semester 9\Tugas Akhir\Skripsi\Implementasi\Koding\Mergesort\x64\Debug\MergesortCUDA.exe (cudaLaunch<char> + 0x28) [0x2638]
=========     Host Frame:D:\College\Semester 9\Tugas Akhir\Skripsi\Implementasi\Koding\Mergesort\x64\Debug\MergesortCUDA.exe (__device_stub__Z7mergeBUPiS_iii + 0xce) [0x250e]
=========     Host Frame:D:\College\Semester 9\Tugas Akhir\Skripsi\Implementasi\Koding\Mergesort\x64\Debug\MergesortCUDA.exe (mergeBU + 0x4e) [0x21be]
=========     Host Frame:D:\College\Semester 9\Tugas Akhir\Skripsi\Implementasi\Koding\Mergesort\x64\Debug\MergesortCUDA.exe (mergesort + 0x3a6) [0x1c16]
=========     Host Frame:D:\College\Semester 9\Tugas Akhir\Skripsi\Implementasi\Koding\Mergesort\x64\Debug\MergesortCUDA.exe (main + 0xb3) [0x23d3]
=========     Host Frame:D:\College\Semester 9\Tugas Akhir\Skripsi\Implementasi\Koding\Mergesort\x64\Debug\MergesortCUDA.exe (invoke_main + 0x34) [0x54b4]
=========     Host Frame:D:\College\Semester 9\Tugas Akhir\Skripsi\Implementasi\Koding\Mergesort\x64\Debug\MergesortCUDA.exe (__scrt_common_main_seh + 0x12e) [0x536e]
=========     Host Frame:D:\College\Semester 9\Tugas Akhir\Skripsi\Implementasi\Koding\Mergesort\x64\Debug\MergesortCUDA.exe (__scrt_common_main + 0xe) [0x522e]
=========     Host Frame:D:\College\Semester 9\Tugas Akhir\Skripsi\Implementasi\Koding\Mergesort\x64\Debug\MergesortCUDA.exe (mainCRTStartup + 0x9) [0x54d9]
=========     Host Frame:C:\Windows\system32\KERNEL32.DLL (BaseThreadInitThunk + 0x22) [0x12d92]
=========     Host Frame:C:\Windows\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x34) [0x9f64]

Question 2 : How to debug that actually?

Thanks for reply.

Try starting with the method here:

http://stackoverflow.com/questions/27277365/unspecified-launch-failure-on-memcpy/27278218#27278218

That will localize the “Invalid global read of size 4” to a single line of kernel code that is causing the out-of-bounds access. Once you’ve determined that, if you need additional clarity, careful use of in-kernel printf may help with getting any additional information you need.