Hi,
I have a GT 540M in my laptop. The "concurrent kernels" demo from the SDK samples works correctly.
In my application I have a few CPU threads, each of which calls kernel functions.
Part of every thread code:
// --- Per-thread GPU work: alloc -> H2D copies -> kernel -> D2H copy -> free ---
//
// NOTE(review): this is why you see no concurrency. cudaMalloc, cudaFree,
// cudaStreamCreate and cudaStreamDestroy are implicitly synchronizing calls:
// they serialize against work in ALL streams on the device. Running them on
// every invocation, in every CPU thread, fences the device between kernels,
// so the per-stream launches can never overlap. To get real overlap, allocate
// the device buffers and create the stream ONCE per thread (outside the hot
// path) and reuse them across calls.
//
// NOTE(review): cudaMemcpyAsync only runs asynchronously when the host
// buffers (h_img, h_pattern, h_corr) are pinned via cudaMallocHost /
// cudaHostAlloc; with pageable memory the copy degrades to a blocking copy.
// Can't tell from this snippet how they were allocated — verify.
//
// NOTE(review): on CUDA toolkits older than 4.0, each CPU thread gets its own
// context, and kernels from different contexts never run concurrently even on
// a concurrent-kernel-capable GPU (GT 540M is Fermi, CC 2.1). Confirm the
// toolkit version in use.

// Synchronous setup: grouped together so the implicit device-wide
// synchronization happens before the async region, not interleaved with it.
CUDA_SAFE_CALL( cudaGetDeviceProperties(&deviceProp, device) );
CUDA_SAFE_CALL( cudaMalloc( (void**) &d_corr, corr_size * 3));
CUDA_SAFE_CALL( cudaMalloc( (void**) &d_img, img_size));
CUDA_SAFE_CALL( cudaMalloc( (void**) &d_pattern, 2*ss*sizeof(int)));
CUDA_SAFE_CALL( cudaStreamCreate( &m_stream ) );

// Async region: all work is issued into m_stream so it could, in principle,
// overlap with other threads' streams.
CUDA_SAFE_CALL( cudaMemcpyAsync( d_img, h_img, img_size, cudaMemcpyHostToDevice, m_stream) );
CUDA_SAFE_CALL( cudaMemcpyAsync( d_pattern, h_pattern, 2*ss*sizeof(int), cudaMemcpyHostToDevice, m_stream) );
CUDA_SAFE_CALL( cudaMemsetAsync( d_corr, 0, corr_size * 3, m_stream));

// Single-block launch: only one SM is occupied, which is what would allow
// kernels from other streams to run alongside this one.
dim3 grid( 1, 1, 1);
dim3 threads(threadsNum, 1, 1);
CorrExtrGpu<<< grid, threads, 0, m_stream >>>(
    d_img,
    img.SizeX(),
    img.SizeY(),
    d_corr,
    m_binNo,
    m_off,
    d_pattern,
    (int)(2 * ss));
CUT_CHECK_ERROR("Kernel ExtrGpu execution failed");

// Copy results back, then block this CPU thread until the stream drains.
CUDA_SAFE_CALL( cudaMemcpyAsync( h_corr, d_corr, corr_size * 3, cudaMemcpyDeviceToHost, m_stream) );
// Fix: the synchronize was the one unchecked CUDA call — async kernel faults
// surface here, so its status must be inspected.
CUDA_SAFE_CALL( cudaStreamSynchronize(m_stream) );

// Teardown (also implicitly synchronizing — see the note above about
// hoisting this out of the per-call path).
CUDA_SAFE_CALL( cudaFree(d_img) );
CUDA_SAFE_CALL( cudaFree(d_pattern) );
CUDA_SAFE_CALL( cudaFree(d_corr) );
CUDA_SAFE_CALL( cudaStreamDestroy( m_stream ) );
For testing purposes I put a dummy loop in the kernel, so a single kernel execution takes about 1.5 seconds.
Yet there is no concurrency: if I change m_stream to 0 in all of the calls, the total application run time stays the same. During the tests I ran 8 CPU threads.
Why? Where am I making the mistake?
PS. The profiler says that kernel time is about 93% of the GPU execution time, so the memcpys are not the problem for me.