I’m using Geforce Fermi 610M, 1 SM run on Windows 10 64bit,
Here is my runnable code:
#include <stdio.h>
#include <conio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <cuda.h>
#include <device_launch_parameters.h>
#include <cuda_runtime.h>
#include <cuda_device_runtime_api.h>
#include <cuda_profiler_api.h>
#include <device_functions.h>
int N = 1572864;
int gridsize=8;
int blocksize=192;
int *a, *aux;
int mergesort(int *a, int N);
__global__ void mergeBU(int *d_a, int *d_aux, int N, int Nthreads, int sz);
__device__ void merge(int *d_a, int *d_aux, int lo, int mid, int hi);
__global__ void mergeBU(int *d_a, int *d_aux, int N, int Nthreads, int sz)
{
int lo;
int idx = blockIdx.x*blockDim.x + threadIdx.x;
float slices = N / (2 * Nthreads*sz);
if (slices > 1.0 || slices == 1.0)
{
lo = (N / Nthreads)*idx;
for (float slice = 0; slice < slices; slice++)
{
if (lo >= N) { break; }
int mid = lo + sz - 1;
int hi = __min(lo + sz + sz - 1, N - 1);
merge(d_a, d_aux, lo, mid, hi);
lo = lo + sz + sz;
}
}
else if (slices < 1.0)
{
lo = 2 * sz * idx;
int mid = lo + sz - 1;
int hi = __min(lo + sz + sz - 1, N - 1);
merge(d_a, d_aux, lo, mid, hi);
}
}
__device__ void merge(int *d_a, int *d_aux, int lo, int mid, int hi)
{
int i = lo;
int j = mid + 1;
for (int k = lo; k <= hi; k++)
{
d_aux[k] = d_a[k];
}
for (int k = lo; k <= hi; k++)
{
if (i > mid) { d_a[k] = d_aux[j]; j++; }
else if (j > hi) { d_a[k] = d_aux[i]; i++; }
else if (d_aux[j] < d_aux[i]) { d_a[k] = d_aux[j]; j++; }
else { d_a[k] = d_aux[i]; i++; }
}
}
int randInRange(int min1, int max1)
{
double scale = 1.0 / (RAND_MAX + 1);
double range = max1 - min1 + 1;
return min1 + (int)(rand() * scale * range);
}
int mergesort(int *a, int N)
{
int Nthreads = gridsize*blocksize;
cudaError_t cudaStatus;
int *d_a, *d_aux;
cudaStatus = (cudaEventCreate(&startEvent));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(cudaStatus));
exit(EXIT_FAILURE);
}
cudaStatus = (cudaEventCreate(&stopEvent));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
exit(EXIT_FAILURE);
}
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess)
{
printf("cudaSetDevice returned error %s (code %d), line(%d)\n", cudaGetErrorString(cudaStatus), cudaStatus, __LINE__);
exit(EXIT_FAILURE);
}
cudaStatus = cudaMalloc((void**)&d_a, N * sizeof(int));
if (cudaStatus != cudaSuccess)
{
printf("cudaMalloc d_a returned error %s (code %d), line(%d)\n", cudaGetErrorString(cudaStatus), cudaStatus, __LINE__);
exit(EXIT_FAILURE);
}
cudaStatus = cudaMalloc((void**)&d_aux, N * sizeof(int));
if (cudaStatus != cudaSuccess)
{
printf("cudaMalloc d_aux returned error %s (code %d), line(%d)\n", cudaGetErrorString(cudaStatus), cudaStatus, __LINE__);
//exit(EXIT_FAILURE);
}
cudaStatus = cudaMemcpy(d_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
printf("cudaMemcpy d_a returned error %s (code %d), line(%d)\n", cudaGetErrorString(cudaStatus), cudaStatus, __LINE__);
exit(EXIT_FAILURE);
}
for (int sz = 1; sz < N; sz = sz + sz)
{
mergeBU << <gridsize, blocksize >> > (d_a, d_aux, N, Nthreads,sz);
}
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
exit(EXIT_FAILURE);
}
cudaStatus = cudaMemcpy(a, d_a, N * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
printf("cudaMemcpy (a,d_a) returned error %s (code %d), line(%d)\n", cudaGetErrorString(cudaStatus), cudaStatus, __LINE__);
exit(EXIT_FAILURE);
}
//cleanup
cudaFree(d_a);
cudaFree(d_aux);
cudaEventDestroy(startEvent);
cudaEventDestroy(stopEvent);
fclose(Culog);
return cudaStatus;
}
int main(void)
{
a = (int *)malloc(N * sizeof(int));
aux = (int *)malloc(N * sizeof(int));
// random generator
for (int i = 0; i < N; i++)
{
a[i] = randInRange(1,100);
}
mergesort(a, N);
//cleanup==============================================================
free(a);
free(aux);
system("PAUSE");
return 0;
}
I tested on various number of threads and various number of data :
<<<8,12>>> 96 threads --> max N = 48mb = 786432 elements //because of int type (@4bytes)
<<<8,24>>> 192 threads --> max N = 24mb = 1572864 elements
<<<8,48>>> 384 threads --> max N = 12mb = 3145728 elements
<<<8,96>>> 768 threads --> max N = 6mb = 6291456 elements
<<<8,192>>> 1536 threads --> max N = 3mb = 12582912 elements
Question = Why can I only invoke 48mb max at some number threads? I have 2048mb on gpu memory, how is this corresponding to that number?
If I run <<<8,192>>> on 6mb data set, it fails, it gives me error illegal memory access(code77).
Here is my ptxas info when i run on 8,192 with 6mb data set:
1> ptxas info : 0 bytes gmem
1> ptxas info : Function properties for _Z5mergePiS_iii
1> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads
1> ptxas info : Compiling entry function '_Z7mergeBUPiS_iii' for 'sm_20'
1> ptxas info : Function properties for _Z7mergeBUPiS_iii
1> 16 bytes stack frame, 16 bytes spill stores, 32 bytes spill loads
1> ptxas info : Used 20 registers, 60 bytes cmem[0]
1> ptxas info : 0 bytes gmem
1> ptxas info : Function properties for _Z5mergePiS_iii
1> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads
1> ptxas info : Compiling entry function '_Z7mergeBUPiS_iii' for 'sm_30'
1> ptxas info : Function properties for _Z7mergeBUPiS_iii
1> 16 bytes stack frame, 16 bytes spill stores, 32 bytes spill loads
1> ptxas info : Used 20 registers, 348 bytes cmem[0]
1> ptxas info : 0 bytes gmem
1> ptxas info : Function properties for cudaDeviceGetAttribute
1> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Function properties for _Z5mergePiS_iii
1> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Function properties for cudaMalloc
1> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessor
1> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Function properties for cudaGetDevice
1> 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Compiling entry function '_Z7mergeBUPiS_iii' for 'sm_35'
1> ptxas info : Function properties for _Z7mergeBUPiS_iii
1> 16 bytes stack frame, 16 bytes spill stores, 32 bytes spill loads
1> ptxas info : Used 20 registers, 348 bytes cmem[0]
1> ptxas info : Function properties for cudaFuncGetAttributes
1> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
1> 40 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : 0 bytes gmem
1> ptxas info : Function properties for cudaDeviceGetAttribute
1> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Function properties for _Z5mergePiS_iii
1> 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Function properties for cudaMalloc
1> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessor
1> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Function properties for cudaGetDevice
1> 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Compiling entry function '_Z7mergeBUPiS_iii' for 'sm_37'
1> ptxas info : Function properties for _Z7mergeBUPiS_iii
1> 16 bytes stack frame, 16 bytes spill stores, 32 bytes spill loads
1> ptxas info : Used 20 registers, 348 bytes cmem[0]
1> ptxas info : Function properties for cudaFuncGetAttributes
1> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
1> 40 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : 0 bytes gmem
1> ptxas info : Function properties for cudaDeviceGetAttribute
1> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Function properties for _Z5mergePiS_iii
1> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads
1> ptxas info : Function properties for cudaMalloc
1> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessor
1> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Function properties for cudaGetDevice
1> 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Compiling entry function '_Z7mergeBUPiS_iii' for 'sm_50'
1> ptxas info : Function properties for _Z7mergeBUPiS_iii
1> 32 bytes stack frame, 32 bytes spill stores, 108 bytes spill loads
1> ptxas info : Used 20 registers, 348 bytes cmem[0]
1> ptxas info : Function properties for cudaFuncGetAttributes
1> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
1> 40 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : 0 bytes gmem
1> ptxas info : Function properties for cudaDeviceGetAttribute
1> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Function properties for _Z5mergePiS_iii
1> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads
1> ptxas info : Function properties for cudaMalloc
1> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessor
1> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Function properties for cudaGetDevice
1> 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Compiling entry function '_Z7mergeBUPiS_iii' for 'sm_52'
1> ptxas info : Function properties for _Z7mergeBUPiS_iii
1> 32 bytes stack frame, 32 bytes spill stores, 108 bytes spill loads
1> ptxas info : Used 20 registers, 348 bytes cmem[0]
1> ptxas info : Function properties for cudaFuncGetAttributes
1> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
1> 40 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : 0 bytes gmem
1> ptxas info : Function properties for cudaDeviceGetAttribute
1> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Function properties for _Z5mergePiS_iii
1> 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads
1> ptxas info : Function properties for cudaMalloc
1> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessor
1> 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Function properties for cudaGetDevice
1> 8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Compiling entry function '_Z7mergeBUPiS_iii' for 'sm_60'
1> ptxas info : Function properties for _Z7mergeBUPiS_iii
1> 32 bytes stack frame, 32 bytes spill stores, 108 bytes spill loads
1> ptxas info : Used 20 registers, 348 bytes cmem[0]
1> ptxas info : Function properties for cudaFuncGetAttributes
1> 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1> ptxas info : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
1> 40 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
Question 1 : is it beacause i run out gpu memory? well I guess not, with that amount of stack frame, it provides me a lot more memory that I can use.
based on :
https://devtalk.nvidia.com/default/topic/642743/what-is-the-maximum-cuda-stack-frame-size-per-kerenl-/
and
http://stackoverflow.com/questions/34794481/cuda-stack-and-heap
Quetion 1.b : why is that 0 byte gmem? I’m sure that I already use cudaMalloc and cudaMemcpy to copy it onto device memory.
Then I run cuda-memcheck :
Invalid __global__ read of size 4
========= at 0x000001b8 in D:/College/Semester 9/Tugas Akhir/Skripsi/Implementasi/Koding/Mergesort/MergesortCUDA/mergesortCUDA.cu:67:merge(int*, int*, int, int, int)
========= by thread (111,0,0) in block (7,0,0)
========= Address 0x2d7900000 is out of bounds
========= Device Frame:D:/College/Semester 9/Tugas Akhir/Skripsi/Implementasi/Koding/Mergesort/MergesortCUDA/mergesortCUDA.cu:57:mergeBU(int*, int*, int, int, int) (mergeBU(int*, int*, int, int, int) : 0x768)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:C:\Windows\system32\nvcuda.dll (cuModuleLoadDataEx + 0x193672) [0x19aef5]
========= Host Frame:D:\College\Semester 9\Tugas Akhir\Skripsi\Implementasi\Koding\Mergesort\x64\Debug\cudart64_80.dll (_cudaInitManagedRuntime + 0x3bb2) [0x5292]
========= Host Frame:D:\College\Semester 9\Tugas Akhir\Skripsi\Implementasi\Koding\Mergesort\x64\Debug\cudart64_80.dll (cudaLaunch + 0x105) [0x2e8e5]
========= Host Frame:D:\College\Semester 9\Tugas Akhir\Skripsi\Implementasi\Koding\Mergesort\x64\Debug\MergesortCUDA.exe (cudaLaunch<char> + 0x28) [0x2638]
========= Host Frame:D:\College\Semester 9\Tugas Akhir\Skripsi\Implementasi\Koding\Mergesort\x64\Debug\MergesortCUDA.exe (__device_stub__Z7mergeBUPiS_iii + 0xce) [0x250e]
========= Host Frame:D:\College\Semester 9\Tugas Akhir\Skripsi\Implementasi\Koding\Mergesort\x64\Debug\MergesortCUDA.exe (mergeBU + 0x4e) [0x21be]
========= Host Frame:D:\College\Semester 9\Tugas Akhir\Skripsi\Implementasi\Koding\Mergesort\x64\Debug\MergesortCUDA.exe (mergesort + 0x3a6) [0x1c16]
========= Host Frame:D:\College\Semester 9\Tugas Akhir\Skripsi\Implementasi\Koding\Mergesort\x64\Debug\MergesortCUDA.exe (main + 0xb3) [0x23d3]
========= Host Frame:D:\College\Semester 9\Tugas Akhir\Skripsi\Implementasi\Koding\Mergesort\x64\Debug\MergesortCUDA.exe (invoke_main + 0x34) [0x54b4]
========= Host Frame:D:\College\Semester 9\Tugas Akhir\Skripsi\Implementasi\Koding\Mergesort\x64\Debug\MergesortCUDA.exe (__scrt_common_main_seh + 0x12e) [0x536e]
========= Host Frame:D:\College\Semester 9\Tugas Akhir\Skripsi\Implementasi\Koding\Mergesort\x64\Debug\MergesortCUDA.exe (__scrt_common_main + 0xe) [0x522e]
========= Host Frame:D:\College\Semester 9\Tugas Akhir\Skripsi\Implementasi\Koding\Mergesort\x64\Debug\MergesortCUDA.exe (mainCRTStartup + 0x9) [0x54d9]
========= Host Frame:C:\Windows\system32\KERNEL32.DLL (BaseThreadInitThunk + 0x22) [0x12d92]
========= Host Frame:C:\Windows\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x34) [0x9f64]
Question 2 : How to debug that actually?
Thanks for reply.