Can't get any concurrency on simple vector add across multi-GPU and streams
I have built a small test case to better understand asynchronous data transfer and execution.
The problem is that I have tried every possible combination to achieve concurrency:
loop_over(memcpy, kernel, memcpy).
loop_over(memcpy) then loop_over(kernel) then loop_over(memcpy).
And still I can't get any concurrency with both multi-GPU AND streams on each GPU.
Multi-GPU works great and the GPUs run concurrently, but I can't get computation and data transfer to overlap.
I am attaching my code so you can help me:
[code]
#include <stdio.h>
#include <stdlib.h>
// #include <cutil_inline.h>
#include <cuda.h>
#include <my_cuda_lib/my_lib.cu>
#include <my_cuda_lib/timer.h>
#include "helpers.cu"
#define DEBUG

__global__ void vectorAddGPU(float *a, float *b, float *c, int N)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx < N)
c[idx] = a[idx] + b[idx];
}

// Allocate generic memory with malloc() and pin it later instead of using cudaHostAlloc()


int main(int argc, char *argv[])
{
int nstreams_per_device=atoi(argv[2]);
int n, nelem;
int numGPUs;
cudaGetDeviceCount(&numGPUs);
printf("CUDA-capable device count: %i\n", numGPUs);
cudaStream_t streams_array[numGPUs*nstreams_per_device];
TGPUplan * plan;
plan=(TGPUplan *)malloc(numGPUs*sizeof(TGPUplan));
printf("plans created successfully \n");

unsigned int flags;
size_t bytes;
float *a, *b, *c; // Pinned memory allocated on the CPU
// Device pointers for mapped memory
#ifdef DEBUG
float errorNorm, refNorm, ref, diff;
#endif

/* Allocate mapped CPU memory. */

nelem = atoi(argv[1])*1048576/4;
printf("Total number of elements in each of the three arrays : %d \n",nelem);
bytes = nelem*sizeof(float);
printf("Total number of bytes in each of the three arrays : %zu \n",bytes);
timer total_time;

int ntiled=nelem/numGPUs;
int ntiled2=ntiled/nstreams_per_device;

timer gpu_malloc_timer;
MY_SAFE_CALL(cudaMallocHost(&a, bytes));
MY_SAFE_CALL(cudaMallocHost(&b, bytes));
MY_SAFE_CALL(cudaMallocHost(&c, bytes));
float total_gpu_malloc=gpu_malloc_timer.milliseconds_elapsed();

for(int i = 0; i < numGPUs; i++){
MY_SAFE_CALL( cudaSetDevice(i) );
MY_SAFE_CALL(cudaMalloc(&plan[i].d_a, ntiled*sizeof(float)));
MY_SAFE_CALL(cudaMalloc(&plan[i].d_b, ntiled*sizeof(float)));
MY_SAFE_CALL(cudaMalloc(&plan[i].d_c, ntiled*sizeof(float)));

}

/* Initialize the vectors. */

for(n = 0; n < nelem; n++)
{
a[n] = rand() / (float)RAND_MAX;
b[n] = rand() / (float)RAND_MAX;
}
printf("init arrays \n");
/* Get the device pointers for the pinned CPU memory mapped into the GPU
memory space. */
timer total_gpu_timer;

/* Call the GPU kernel using the device pointers for the mapped memory. */
for(int i = 0; i < numGPUs; i++){
for (int j=0;j<nstreams_per_device;j++){
//for each stream in the device we create a stream
// cudaStream_t * stream_ptr=&((plan[i].streams_array[i]));
MY_SAFE_CALL(cudaStreamCreate(&streams_array[i*nstreams_per_device+j]) );
}
}

dim3 block(256);
dim3 grid((unsigned int)ceil(ntiled2/(float)block.x));
timer kernel_timer;
for (int j=0;j<nstreams_per_device;j++){
for(int i = 0; i < numGPUs; i++){
MY_SAFE_CALL( cudaSetDevice(i) );
MY_SAFE_CALL(cudaMemcpyAsync( plan[i].d_a+j*ntiled2,a+i*ntiled+j*ntiled2 , ntiled2*sizeof(float), cudaMemcpyHostToDevice,streams_array[i*nstreams_per_device+j] ));
}
}
for (int j=0;j<nstreams_per_device;j++){
for(int i = 0; i < numGPUs; i++){
// myCudaGetLastError();
MY_SAFE_CALL(cudaMemcpyAsync( plan[i].d_b+j*ntiled2,b+i*ntiled+j*ntiled2 , ntiled2*sizeof(float), cudaMemcpyHostToDevice,streams_array[i*nstreams_per_device+j] ));
}
}

for (int j=0;j<nstreams_per_device;j++){
for(int i = 0; i < numGPUs; i++){
MY_SAFE_CALL( cudaSetDevice(i) );
vectorAddGPU<<<grid, block>>>(plan[i].d_a+j*ntiled2,plan[i].d_b+j*ntiled2,plan[i].d_c+j*ntiled2, ntiled2);
}
}
// myCudaGetLastError();
for (int j=0;j<nstreams_per_device;j++){
for(int i = 0; i < numGPUs; i++){
MY_SAFE_CALL( cudaSetDevice(i) );
MY_SAFE_CALL(cudaMemcpyAsync( c+i*ntiled+j*ntiled2, plan[i].d_c+j*ntiled2, ntiled2*sizeof(float), cudaMemcpyDeviceToHost,streams_array[i*nstreams_per_device+j] ));
}
}
float kernel_time=kernel_timer.milliseconds_elapsed();


// /* Compare the results */
// for ( int i=0; i<numGPUs; i++ )
// {
// MY_SAFE_CALL( cudaSetDevice(i) );
// cudaDeviceSynchronize();
// // cudaStreamSynchronize(streams_array[i]);
// }

#ifdef DEBUG
printf("Checking the results...\n");
errorNorm = 0.f;
refNorm = 0.f;
for(n = 0; n < nelem; n++)
{
ref = a[n] + b[n];
diff = c[n] - ref;
errorNorm += diff*diff;
refNorm += ref*ref;
}
errorNorm = (float)sqrt((double)errorNorm);
refNorm = (float)sqrt((double)refNorm);
#endif
/* Memory clean up */


timer gpu_free_timer;
MY_SAFE_CALL(cudaFreeHost(a));
MY_SAFE_CALL(cudaFreeHost(b));
MY_SAFE_CALL(cudaFreeHost(c));
float total_gpu_free=gpu_free_timer.milliseconds_elapsed();
float total_gpu_time=total_gpu_timer.milliseconds_elapsed();



float total_time_ms=total_time.milliseconds_elapsed();
printf("total time elapsed was %f \n",total_time_ms);
printf("total mallocGPU time elapsed was %f \n",total_gpu_malloc);
printf("total kernel time elapsed was %f \n",kernel_time);
printf("total gpu free time elapsed was %f \n",total_gpu_free);
printf("total gpu time elapsed was %f \n",total_gpu_time+kernel_time+total_gpu_free);

#ifdef DEBUG
printf("%s\n", (errorNorm/refNorm < 1.e-6f) ? "PASSED" : "FAILED");
#endif


}
[/code]

You can substitute MY_SAFE_CALL with:

#define MY_SAFE_CALL(x) x

I have read:
http://www.pgroup.com/lit/articles/insider/v3n1a4.htm

as well as the webinar, but nothing seems to work.

Any help would be appreciated.

Apostolis

#1
Posted 04/23/2012 11:05 AM   
Hi,
What is clearly missing here is the stream to attach your kernel invocation to. The lack of a stream in your kernel launch parameters means the kernels go to stream #0, which is fully synchronous. Change this to something like "vectorAddGPU<<<grid, block, 0, stream[i]>>>(plan[i].d_a+j*ntiled2, plan[i]..." with one single stream per device.
See [url="http://developer.download.nvidia.com/CUDA/training/StreamsAndConcurrencyWebinar.pdf"]this[/url] and [url="http://developer.download.nvidia.com/CUDA/training/StreamsAndConcurrency_Jan_2012.mp4"]this[/url] for more details.
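For instance, the launch loop could look roughly like this (same names as in your code; just a sketch, not tested):
[code]
// Sketch: issue each kernel into the stream that carries its data,
// after selecting the device that owns it. The 3rd launch parameter is
// dynamic shared memory (0 here), the 4th is the stream.
for (int j = 0; j < nstreams_per_device; j++) {
    for (int i = 0; i < numGPUs; i++) {
        cudaSetDevice(i);
        cudaStream_t s = streams_array[i*nstreams_per_device + j];
        vectorAddGPU<<<grid, block, 0, s>>>(plan[i].d_a + j*ntiled2,
                                            plan[i].d_b + j*ntiled2,
                                            plan[i].d_c + j*ntiled2,
                                            ntiled2);
    }
}
[/code]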

#2
Posted 04/23/2012 11:40 AM   
If you do not use streams, all the calls go to the so-called default stream and they execute in serial order, not concurrently.

[code]


cudaStream_t stream[nstr]; // define streams
for (int is = 0; is < nstr; is++)
{
    // cuda set device and allocations, stream creation
    cudaMalloc(&dev_overlap[is], sizeof(int));
    cudaStreamCreate(&stream[is]);
}

// execution on streams
for (int ist = 0; ist < nstr; ist++)
{
    jxyz[ist].x = jump*(2.0*genrand64_real2()-1.0);
    jxyz[ist].y = jump*(2.0*genrand64_real2()-1.0);
    jxyz[ist].z = jump*(2.0*genrand64_real2()-1.0);
    atom_i[ist] = round((Np-1)*genrand64_real2());
    rnd[ist] = genrand64_real2();
    newMCenergyarray<<<grid,blocks,0,stream[ist]>>>(d....);
}

[/code]

A call without the stream (newMCenergyarray<<<grid,blocks>>>(d....); ) will block all streams.
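For multiple GPUs the same idea applies, except that each stream belongs to the device that was current when it was created, so you have to call cudaSetDevice() before creating the stream and before every copy or launch that uses it. A small self-contained sketch (all names made up, nothing to do with your code):
[code]
// Sketch only: one stream per device, async copies and a kernel issued into it.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void scaleKernel(float *x, int n)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) x[i] *= 2.0f;
}

int main()
{
    const int n = 1 << 20;
    int numGPUs = 0;
    cudaGetDeviceCount(&numGPUs);

    float *h;                                   // pinned host memory, required for async copies
    cudaMallocHost(&h, numGPUs * n * sizeof(float));
    for (int i = 0; i < numGPUs * n; i++) h[i] = 1.0f;

    cudaStream_t *streams = new cudaStream_t[numGPUs];
    float **d = new float*[numGPUs];

    for (int dev = 0; dev < numGPUs; dev++) {
        cudaSetDevice(dev);                     // stream and allocation belong to this device
        cudaMalloc(&d[dev], n * sizeof(float));
        cudaStreamCreate(&streams[dev]);
    }

    for (int dev = 0; dev < numGPUs; dev++) {
        cudaSetDevice(dev);
        cudaMemcpyAsync(d[dev], h + dev*n, n*sizeof(float),
                        cudaMemcpyHostToDevice, streams[dev]);
        scaleKernel<<<(n+255)/256, 256, 0, streams[dev]>>>(d[dev], n);
        cudaMemcpyAsync(h + dev*n, d[dev], n*sizeof(float),
                        cudaMemcpyDeviceToHost, streams[dev]);
    }

    for (int dev = 0; dev < numGPUs; dev++) {   // wait for every device to finish
        cudaSetDevice(dev);
        cudaDeviceSynchronize();
    }
    printf("h[0] = %f (expected 2.0)\n", h[0]);

    for (int dev = 0; dev < numGPUs; dev++) {
        cudaSetDevice(dev);
        cudaFree(d[dev]);
        cudaStreamDestroy(streams[dev]);
    }
    cudaFreeHost(h);
    delete[] streams;
    delete[] d;
    return 0;
}
[/code]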

#3
Posted 04/23/2012 12:45 PM   
That was a minor typo from last night.
I corrected it and still no concurrency.
Here is the updated code:
[code]
#include <stdio.h>
#include <stdlib.h>
// #include <cutil_inline.h>
#include <cuda.h>
#include <my_cuda_lib/my_lib.cu>
#include <my_cuda_lib/timer.h>
#include "helpers.cu"
// #define DEBUG

__global__ void vectorAddGPU(float *a, float *b, float *c, int N)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx < N)
c[idx] = a[idx] + b[idx];
}

// Allocate generic memory with malloc() and pin it later instead of using cudaHostAlloc()


int main(int argc, char *argv[])
{
int nstreams_per_device=atoi(argv[2]);
int n, nelem;
int numGPUs;
cudaGetDeviceCount(&numGPUs);
printf("CUDA-capable device count: %i\n", numGPUs);
cudaStream_t streams_array[numGPUs*nstreams_per_device];
TGPUplan * plan;
plan=(TGPUplan *)malloc(numGPUs*sizeof(TGPUplan));
printf("plans created successfully \n");

unsigned int flags;
size_t bytes;
float *a, *b, *c; // Pinned memory allocated on the CPU
// Device pointers for mapped memory
#ifdef DEBUG
float errorNorm, refNorm, ref, diff;
#endif

/* Allocate mapped CPU memory. */

nelem = atoi(argv[1])*1048576/4;
printf("Total number of elements in each of the three arrays : %d \n",nelem);
bytes = nelem*sizeof(float);
printf("Total number of bytes in each of the three arrays : %zu \n",bytes);
timer total_time;

int ntiled=nelem/numGPUs;
int ntiled2=ntiled/nstreams_per_device;

timer gpu_malloc_timer;
MY_SAFE_CALL(cudaMallocHost(&a, bytes));
MY_SAFE_CALL(cudaMallocHost(&b, bytes));
MY_SAFE_CALL(cudaMallocHost(&c, bytes));
float total_gpu_malloc=gpu_malloc_timer.milliseconds_elapsed();

for(int i = 0; i < numGPUs; i++){
MY_SAFE_CALL( cudaSetDevice(i) );
MY_SAFE_CALL(cudaMalloc(&plan[i].d_a, ntiled*sizeof(float)));
MY_SAFE_CALL(cudaMalloc(&plan[i].d_b, ntiled*sizeof(float)));
MY_SAFE_CALL(cudaMalloc(&plan[i].d_c, ntiled*sizeof(float)));

}

/* Initialize the vectors. */

for(n = 0; n < nelem; n++)
{
a[n] = rand() / (float)RAND_MAX;
b[n] = rand() / (float)RAND_MAX;
}
printf("init arrays \n");
/* Get the device pointers for the pinned CPU memory mapped into the GPU
memory space. */
timer total_gpu_timer;

/* Call the GPU kernel using the device pointers for the mapped memory. */
for(int i = 0; i < numGPUs; i++){
for (int j=0;j<nstreams_per_device;j++){
//for each stream in the device we create a stream
cudaStream_t * stream_ptr=&((streams_array[i*nstreams_per_device+j]));
MY_SAFE_CALL(cudaStreamCreate(stream_ptr ));
}
}

dim3 block(256);
dim3 grid((unsigned int)ceil(ntiled2/(float)block.x));
timer kernel_timer;
for (int j=0;j<nstreams_per_device;j++){
for(int i = 0; i < numGPUs; i++){
MY_SAFE_CALL( cudaSetDevice(i) );
MY_SAFE_CALL(cudaMemcpyAsync( plan[i].d_a+j*ntiled2,a+i*ntiled+j*ntiled2 , ntiled2*sizeof(float), cudaMemcpyHostToDevice,streams_array[i*nstreams_per_device+j] ));


MY_SAFE_CALL(cudaMemcpyAsync( plan[i].d_b+j*ntiled2,b+i*ntiled+j*ntiled2 , ntiled2*sizeof(float), cudaMemcpyHostToDevice,streams_array[i*nstreams_per_device+j] ));


vectorAddGPU<<<grid, block,0,streams_array[i*nstreams_per_device+j]>>>(plan[i].d_a+j*ntiled2,plan[i].d_b+j*ntiled2,plan[i].d_c+j*ntiled2, ntiled2);




MY_SAFE_CALL(cudaMemcpyAsync( c+i*ntiled+j*ntiled2, plan[i].d_c+j*ntiled2, ntiled2*sizeof(float), cudaMemcpyDeviceToHost,streams_array[i*nstreams_per_device+j] ));
}
}
float kernel_time=kernel_timer.milliseconds_elapsed();


/* Compare the results */
for ( int i=0; i<numGPUs; i++ )
{
MY_SAFE_CALL( cudaSetDevice(i) );
cudaDeviceSynchronize();
// // cudaStreamSynchronize(streams_array[i]);
}

#ifdef DEBUG
printf("Checking the results...\n");
errorNorm = 0.f;
refNorm = 0.f;
for(n = 0; n < nelem; n++)
{
ref = a[n] + b[n];
diff = c[n] - ref;
errorNorm += diff*diff;
refNorm += ref*ref;
}
errorNorm = (float)sqrt((double)errorNorm);
refNorm = (float)sqrt((double)refNorm);
#endif
/* Memory clean up */


timer gpu_free_timer;
MY_SAFE_CALL(cudaFreeHost(a));
MY_SAFE_CALL(cudaFreeHost(b));
MY_SAFE_CALL(cudaFreeHost(c));
float total_gpu_free=gpu_free_timer.milliseconds_elapsed();
float total_gpu_time=total_gpu_timer.milliseconds_elapsed();



float total_time_ms=total_time.milliseconds_elapsed();
printf("total time elapsed was %f \n",total_time_ms);
printf("total mallocGPU time elapsed was %f \n",total_gpu_malloc);
printf("total kernel time elapsed was %f \n",kernel_time);
printf("total gpu free time elapsed was %f \n",total_gpu_free);
printf("total gpu time elapsed was %f \n",total_gpu_time+kernel_time+total_gpu_free);

#ifdef DEBUG
printf("%s\n", (errorNorm/refNorm < 1.e-6f) ? "PASSED" : "FAILED");
#endif


}
[/code]

Any further ideas?

Thanks again,

Apostolis

#4
Posted 04/23/2012 01:02 PM   
It looks good. I suggest starting with simpler code to verify that you have the streams working.


The cudaDeviceSynchronize() makes no sense in the loop. Shouldn't it be outside?

[code]
/* Compare the results */
for ( int i=0; i<numGPUs; i++ )
{
MY_SAFE_CALL( cudaSetDevice(i) );
cudaDeviceSynchronize();
// // cudaStreamSynchronize(streams_array[i]);
}
[/code]
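As a starting point, something as small as this single-GPU test (just a sketch, sizes made up) is enough to see in the profiler whether copies and kernels overlap at all:
[code]
// Sketch of a minimal overlap test: split one vector add over nstreams streams,
// issue H2D copy, kernel and D2H copy per stream, then look at the profiler timeline.
#include <cuda_runtime.h>

__global__ void addKernel(const float *a, const float *b, float *c, int n)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) c[i] = a[i] + b[i];
}

int main()
{
    const int nstreams = 4, n = 4 << 20, chunk = n / nstreams;
    float *ha, *hb, *hc, *da, *db, *dc;
    cudaMallocHost(&ha, n*sizeof(float));      // pinned host memory
    cudaMallocHost(&hb, n*sizeof(float));
    cudaMallocHost(&hc, n*sizeof(float));
    cudaMalloc(&da, n*sizeof(float));
    cudaMalloc(&db, n*sizeof(float));
    cudaMalloc(&dc, n*sizeof(float));
    for (int i = 0; i < n; i++) { ha[i] = 1.0f; hb[i] = 2.0f; }

    cudaStream_t streams[nstreams];
    for (int s = 0; s < nstreams; s++) cudaStreamCreate(&streams[s]);

    for (int s = 0; s < nstreams; s++) {
        int off = s * chunk;
        cudaMemcpyAsync(da+off, ha+off, chunk*sizeof(float), cudaMemcpyHostToDevice, streams[s]);
        cudaMemcpyAsync(db+off, hb+off, chunk*sizeof(float), cudaMemcpyHostToDevice, streams[s]);
        addKernel<<<(chunk+255)/256, 256, 0, streams[s]>>>(da+off, db+off, dc+off, chunk);
        cudaMemcpyAsync(hc+off, dc+off, chunk*sizeof(float), cudaMemcpyDeviceToHost, streams[s]);
    }
    cudaDeviceSynchronize();                   // single sync outside the loop

    for (int s = 0; s < nstreams; s++) cudaStreamDestroy(streams[s]);
    cudaFree(da); cudaFree(db); cudaFree(dc);
    cudaFreeHost(ha); cudaFreeHost(hb); cudaFreeHost(hc);
    return 0;
}
[/code]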

#5
Posted 04/23/2012 01:12 PM   
Just another little fix.
The code now works correctly but still no concurrency:
[code]
__global__ void vectorAddGPU(float *a, float *b, float *c, int N)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx < N)
c[idx] = a[idx] + b[idx];
}

// Allocate generic memory with malloc() and pin it later instead of using cudaHostAlloc()


int main(int argc, char *argv[])
{
int nstreams_per_device=atoi(argv[2]);
int n, nelem;
int numGPUs;
cudaGetDeviceCount(&numGPUs);
printf("CUDA-capable device count: %i\n", numGPUs);
cudaStream_t streams_array[numGPUs*nstreams_per_device];
TGPUplan * plan;
plan=(TGPUplan *)malloc(numGPUs*sizeof(TGPUplan));
printf("plans created successfully \n");

unsigned int flags;
size_t bytes;
float *a, *b, *c; // Pinned memory allocated on the CPU
// Device pointers for mapped memory
#ifdef DEBUG
float errorNorm, refNorm, ref, diff;
#endif

/* Allocate mapped CPU memory. */

nelem = atoi(argv[1])*1048576/4;
printf("Total number of elements in each of the three arrays : %d \n",nelem);
bytes = nelem*sizeof(float);
printf("Total number of bytes in each of the three arrays : %zu \n",bytes);
timer total_time;

int ntiled=nelem/numGPUs;
int ntiled2=ntiled/nstreams_per_device;

timer gpu_malloc_timer;
MY_SAFE_CALL(cudaMallocHost(&a, bytes));
MY_SAFE_CALL(cudaMallocHost(&b, bytes));
MY_SAFE_CALL(cudaMallocHost(&c, bytes));
float total_gpu_malloc=gpu_malloc_timer.milliseconds_elapsed();

for(int i = 0; i < numGPUs; i++){
MY_SAFE_CALL( cudaSetDevice(i) );
MY_SAFE_CALL(cudaMalloc(&plan[i].d_a, ntiled*sizeof(float)));
MY_SAFE_CALL(cudaMalloc(&plan[i].d_b, ntiled*sizeof(float)));
MY_SAFE_CALL(cudaMalloc(&plan[i].d_c, ntiled*sizeof(float)));

}

/* Initialize the vectors. */

for(n = 0; n < nelem; n++)
{
a[n] = rand() / (float)RAND_MAX;
b[n] = rand() / (float)RAND_MAX;
}
printf("init arrays \n");
/* Get the device pointers for the pinned CPU memory mapped into the GPU
memory space. */
timer total_gpu_timer;

/* Call the GPU kernel using the device pointers for the mapped memory. */
for(int i = 0; i < numGPUs; i++){
MY_SAFE_CALL( cudaSetDevice(i) );
for (int j=0;j<nstreams_per_device;j++){
//for each stream in the device we create a stream
cudaStream_t * stream_ptr=&((streams_array[i*nstreams_per_device+j]));
MY_SAFE_CALL(cudaStreamCreate(stream_ptr ));
}
}


dim3 block(256);
dim3 grid((unsigned int)ceil(ntiled2/(float)block.x));
timer kernel_timer;

for(int i = 0; i < numGPUs; i++){
MY_SAFE_CALL( cudaSetDevice(i) );
for (int j=0;j<nstreams_per_device;j++){


MY_SAFE_CALL(cudaMemcpyAsync( plan[i].d_a+j*ntiled2,a+i*ntiled+j*ntiled2 , ntiled2*sizeof(float), cudaMemcpyHostToDevice,streams_array[i*nstreams_per_device+j]
));


MY_SAFE_CALL(cudaMemcpyAsync( plan[i].d_b+j*ntiled2,b+i*ntiled+j*ntiled2 , ntiled2*sizeof(float), cudaMemcpyHostToDevice,streams_array[i*nstreams_per_device+j]
));


vectorAddGPU<<<grid, block,0,streams_array[i*nstreams_per_device+j]>>>(plan[i].d_a+j*ntiled2,plan[i].d_b+j*ntiled2,plan[i].d_c+j*ntiled2, ntiled2);




MY_SAFE_CALL(cudaMemcpyAsync( c+i*ntiled+j*ntiled2, plan[i].d_c+j*ntiled2, ntiled2*sizeof(float), cudaMemcpyDeviceToHost,streams_array[i*nstreams_per_device+j]
));
}
}
float kernel_time=kernel_timer.milliseconds_elapsed();


/* Compare the results */
for ( int i=0; i<numGPUs; i++ )
{
MY_SAFE_CALL( cudaSetDevice(i) );
cudaDeviceSynchronize();
// // cudaStreamSynchronize(streams_array[i]);
}

#ifdef DEBUG
printf("Checking the results...\n");
errorNorm = 0.f;
refNorm = 0.f;
for(n = 0; n < nelem; n++)
{
ref = a[n] + b[n];
diff = c[n] - ref;
errorNorm += diff*diff;
refNorm += ref*ref;
}
errorNorm = (float)sqrt((double)errorNorm);
refNorm = (float)sqrt((double)refNorm);
#endif
/* Memory clean up */


timer gpu_free_timer;
MY_SAFE_CALL(cudaFreeHost(a));
MY_SAFE_CALL(cudaFreeHost(b));
MY_SAFE_CALL(cudaFreeHost(c));
float total_gpu_free=gpu_free_timer.milliseconds_elapsed();
float total_gpu_time=total_gpu_timer.milliseconds_elapsed();



float total_time_ms=total_time.milliseconds_elapsed();
printf("total time elapsed was %f \n",total_time_ms);
printf("total mallocGPU time elapsed was %f \n",total_gpu_malloc);
printf("total kernel time elapsed was %f \n",kernel_time);
printf("total gpu free time elapsed was %f \n",total_gpu_free);
printf("total gpu time elapsed was %f \n",total_gpu_time+kernel_time+total_gpu_free);

#ifdef DEBUG
printf("%s\n", (errorNorm/refNorm < 1.e-6f) ? "PASSED" : "FAILED");
#endif


}
[/code]
This is what I get when I run the profiler:
Attachments: no_conc.png

#6
Posted 04/23/2012 01:23 PM   
Also, when there are two memcpys in the same direction I get a lot less bandwidth than expected.
The H2D bandwidth to the first GPU is 5 GB/s, and when there is an H2D transfer to the second GPU at the same time, the bandwidth to the second GPU is 2.2 GB/s.
When there is only an H2D transfer to the second GPU, the bandwidth is still 5 GB/s.

Also, cudaDeviceSynchronize() is in a loop because I have to synchronize 2 GPUs, not just one.
Am I doing something wrong here?
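For reference, a minimal self-contained sketch of how one can time a single H2D copy per device with CUDA events (just an illustration, not the exact code behind the numbers above) would be:
[code]
// Sketch: measure H2D bandwidth to each GPU separately with event timers.
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    const size_t bytes = 256 << 20;            // 256 MB per transfer
    int numGPUs = 0;
    cudaGetDeviceCount(&numGPUs);

    float *h;
    cudaMallocHost(&h, bytes);                 // pinned host buffer

    for (int dev = 0; dev < numGPUs; dev++) {
        cudaSetDevice(dev);
        float *d;
        cudaMalloc(&d, bytes);

        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        cudaEventRecord(start, 0);
        cudaMemcpyAsync(d, h, bytes, cudaMemcpyHostToDevice, 0);
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);

        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);
        printf("GPU %d H2D: %.2f GB/s\n", dev, (bytes / 1.0e9) / (ms / 1000.0));

        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        cudaFree(d);
    }
    cudaFreeHost(h);
    return 0;
}
[/code]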

#7
Posted 04/23/2012 01:33 PM   
In order to figure out what is going wrong I created a small test case for a single GPU by modifying the SDK example.
I can get 2-way concurrency (kernel + D2H) but not 3-way (H2D + kernel + D2H).
I am attaching the code that produces 2-way concurrency.
If I use cudaMemcpyAsync for the H2D transfers the results are correct, but the whole process gets serialized.
The same thing happened in the CUBLAS code from the webinar; I don't know why.
Any ideas?
I am terribly baffled by this. Please note that in the multi-GPU case it isn't practical to use only 2-way concurrency, since I have to wait for the H2D transfer to complete before I change context.
Thanks in advance,

Apostolis

[code]
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*
*
* This sample illustrates the usage of CUDA streams for overlapping
* kernel execution with device/host memcopies. The kernel is used to
* initialize an array to a specific value, after which the array is
* copied to the host (CPU) memory. To increase performance, multiple
* kernel/memcopy pairs are launched asynchronously, each pair in its
* own stream. Devices with Compute Capability 1.1 can overlap a kernel
* and a memcopy as long as they are issued in different streams. Kernels
* are serialized. Thus, if n pairs are launched, streamed approach
* can reduce the memcopy cost to the (1/n)th of a single copy of the entire
* data set.
*
* Additionally, this sample uses CUDA events to measure elapsed time for
* CUDA calls. Events are a part of CUDA API and provide a system independent
* way to measure execution times on CUDA devices with approximately 0.5
* microsecond precision.
*
* Elapsed times are averaged over nreps repetitions (10 by default).
*
*/

const char *sSDKsample = "simpleStreams";

const char *sEventSyncMethod[] =
{
"cudaEventDefault",
"cudaEventBlockingSync",
"cudaEventDisableTiming",
NULL
};

const char *sDeviceSyncMethod[] =
{
"cudaDeviceScheduleAuto",
"cudaDeviceScheduleSpin",
"cudaDeviceScheduleYield",
"INVALID",
"cudaDeviceScheduleBlockingSync",
NULL
};

// Include headers
#include <stdio.h>

// CUDA utilities and system includes
#include <cuda_runtime.h>

// Shared Library Test Functions
#include <sdkHelper.h> // helper for shared that are common to CUDA SDK samples
#include <shrUtils.h>
#include <shrQATest.h>
#include <my_cuda_lib/my_lib.cu>

#ifndef WIN32
#include <sys/mman.h> // for mmap() / munmap()
#endif

////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions

// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)

inline void __checkCudaErrors(cudaError err, const char *file, const int line )
{
if(cudaSuccess != err)
{
fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );
exit(-1);
}
}

// This will output the proper error string when calling cudaGetLastError
#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)

inline void __getLastCudaError(const char *errorMessage, const char *file, const int line )
{
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
file, line, errorMessage, (int)err, cudaGetErrorString( err ) );
exit(-1);
}
}

// General GPU Device CUDA Initialization
int gpuDeviceInit(int devID)
{
int deviceCount;
checkCudaErrors(cudaGetDeviceCount(&deviceCount));

if (deviceCount == 0)
{
fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
exit(-1);
}

if (devID < 0)
devID = 0;

if (devID > deviceCount-1)
{
fprintf(stderr, "\n");
fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. <<\n", devID);
fprintf(stderr, "\n");
return -devID;
}

cudaDeviceProp deviceProp;
checkCudaErrors( cudaGetDeviceProperties(&deviceProp, devID) );

if (deviceProp.major < 1)
{
fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
exit(-1);
}

checkCudaErrors( cudaSetDevice(devID) );
printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name);

return devID;
}

// This function returns the best GPU (with maximum GFLOPS)
int gpuGetMaxGflopsDeviceId()
{
int current_device = 0, sm_per_multiproc = 0;
int max_compute_perf = 0, max_perf_device = 0;
int device_count = 0, best_SM_arch = 0;
cudaDeviceProp deviceProp;
cudaGetDeviceCount( &device_count );

// Find the best major SM Architecture GPU device
while (current_device < device_count)
{
cudaGetDeviceProperties( &deviceProp, current_device );
if (deviceProp.major > 0 && deviceProp.major < 9999)
{
best_SM_arch = MAX(best_SM_arch, deviceProp.major);
}
current_device++;
}

// Find the best CUDA capable GPU device
current_device = 0;
while( current_device < device_count )
{
cudaGetDeviceProperties( &deviceProp, current_device );
if (deviceProp.major == 9999 && deviceProp.minor == 9999)
{
sm_per_multiproc = 1;
}
else
{
sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
}

int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;

if( compute_perf > max_compute_perf )
{
// If we find GPU with SM major > 2, search only these
if ( best_SM_arch > 2 )
{
// If our device==dest_SM_arch, choose this, or else pass
if (deviceProp.major == best_SM_arch)
{
max_compute_perf = compute_perf;
max_perf_device = current_device;
}
}
else
{
max_compute_perf = compute_perf;
max_perf_device = current_device;
}
}
++current_device;
}
return max_perf_device;
}


// Initialization code to find the best CUDA Device
int findCudaDevice(int argc, const char **argv)
{
cudaDeviceProp deviceProp;
int devID = 0;
// If the command-line has a device number specified, use it
if (checkCmdLineFlag(argc, argv, "device"))
{
devID = getCmdLineArgumentInt(argc, argv, "device=");
if (devID < 0)
{
printf("Invalid command line parameter\n ");
exit(-1);
}
else
{
devID = gpuDeviceInit(devID);
if (devID < 0)
{
printf("exiting...\n");
shrQAFinishExit(argc, (const char **)argv, QA_FAILED);
exit(-1);
}
}
}
else
{
// Otherwise pick the device with highest Gflops/s
devID = gpuGetMaxGflopsDeviceId();
checkCudaErrors( cudaSetDevice( devID ) );
checkCudaErrors( cudaGetDeviceProperties(&deviceProp, devID) );
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
}
return devID;
}
// end of CUDA Helper Functions


// Macro to aligned up to the memory size in question
#define MEMORY_ALIGNMENT 4096
#define ALIGN_UP(x,size) ( ((size_t)x+(size-1))&(~(size-1)) )

__global__ void vectorAddGPU(int *a, int *b, int *c, int N,int nIterations)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
// for( int i=0;i<nIterations;i++){
if (idx < N){

c[idx] = a[idx] + b[idx];}
}
// }

int correct_data(int *a,int*b,int*c, const int n)
{
for(int i = 0; i < n; i++) {
if(c[i] != a[i]+b[i]) {
printf("%d: %d %d\n", i, a[i], c[i]);
return 0;
}
}
return 1;
}

inline void
AllocateHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes)
{
#if CUDART_VERSION >= 4000
if (bPinGenericMemory)
{
// allocate a generic page-aligned chunk of system memory
#ifdef WIN32
printf("> VirtualAlloc() allocating %4.2f Mbytes of (generic page-aligned system memory)\n", (float)nbytes/1048576.0f);
*pp_a = (int *) VirtualAlloc( NULL, (nbytes + MEMORY_ALIGNMENT), MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE );
#else
printf("> mmap() allocating %4.2f Mbytes (generic page-aligned system memory)\n", (float)nbytes/1048576.0f);
*pp_a = (int *) mmap ( NULL, (nbytes + MEMORY_ALIGNMENT), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0 );
#endif

*ppAligned_a = (int *)ALIGN_UP(*pp_a, MEMORY_ALIGNMENT);

printf("> cudaHostRegister() registering %4.2f Mbytes of generic allocated system memory\n", (float)nbytes/1048576.0f);
// pin allocate memory
checkCudaErrors( cudaHostRegister(*ppAligned_a, nbytes, cudaHostRegisterMapped) );
}
else
#endif
{
printf("> cudaMallocHost() allocating %4.2f Mbytes of system memory\n", (float)nbytes/1048576.0f);
// allocate host memory (pinned is required for achieve asynchronicity)
checkCudaErrors( cudaMallocHost((void**)pp_a, nbytes) );
*ppAligned_a = *pp_a;
}
}

inline void
FreeHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes)
{
#if CUDART_VERSION >= 4000
// CUDA 4.0 support pinning of generic host memory
if (bPinGenericMemory)
{
// unpin and delete host memory
checkCudaErrors( cudaHostUnregister(*ppAligned_a) );
#ifdef WIN32
VirtualFree(*pp_a, 0, MEM_RELEASE);
#else
munmap(*pp_a, nbytes);
#endif
}
else
#endif
{
cudaFreeHost(*pp_a);
}
}

static char *sSyncMethod[] =
{
"0 (Automatic Blocking)",
"1 (Spin Blocking)",
"2 (Yield Blocking)",
"3 (Undefined Blocking Method)",
"4 (Blocking Sync Event) = low CPU utilization",
NULL
};

void printHelp()
{
printf("Usage: %s [options below]\n", sSDKsample);
printf("\t--sync_method=n for CPU/GPU synchronization\n");
printf("\t n=%s\n", sSyncMethod[0]);
printf("\t n=%s\n", sSyncMethod[1]);
printf("\t n=%s\n", sSyncMethod[2]);
printf("\t <Default> n=%s\n", sSyncMethod[4]);
printf("\t--use_generic_memory (default) use generic page-aligned for system memory\n");
printf("\t--use_cuda_malloc_host (optional) use cudaMallocHost to allocate system memory\n");
}

#if defined(__APPLE__) || defined(MACOSX)
#define DEFAULT_PINNED_GENERIC_MEMORY false
#else
#define DEFAULT_PINNED_GENERIC_MEMORY true
#endif

int main(int argc, char **argv)
{
int cuda_device = 0;
int nstreams = 4; // number of streams for CUDA calls
int nreps = 1; // number of times each experiment is repeated
int n = 16 * 1024 * 1024; // number of ints in the data set
int nbytes = n * sizeof(int); // number of data bytes
dim3 threads, blocks; // kernel launch configuration
float elapsed_time, time_memcpy, time_kernel; // timing variables
float scale_factor = 1.0f;

// allocate generic memory and pin it later instead of using cudaHostAlloc()

bool bPinGenericMemory = DEFAULT_PINNED_GENERIC_MEMORY; // we want this to be the default behavior
int device_sync_method = cudaDeviceBlockingSync; // by default we use BlockingSync

int niterations; // number of iterations for the loop inside the kernel

shrQAStart(argc, argv);

printf("[ %s ]\n\n", sSDKsample);
if( checkCmdLineFlag( argc, (const char **)argv, "help") ) {
printHelp();
shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
}
if( (device_sync_method = getCmdLineArgumentInt( argc, (const char **)argv, "sync_method" )) >= 0 ) {
if (device_sync_method == 0 || device_sync_method == 1 || device_sync_method == 2 || device_sync_method == 4) {
printf("Device synchronization method set to = %s\n", sSyncMethod[device_sync_method]);
printf("Setting reps to %d to demonstrate steady state\n",nreps);
nreps = 100;
} else {
printf("Invalid command line option sync_method=\"%d\"\n", device_sync_method);
shrQAFinishExit(argc, (const char **)argv, QA_FAILED);
}
} else {
printHelp();
shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
}

if( checkCmdLineFlag( argc, (const char **)argv, "use_generic_memory") ) {
#if defined(__APPLE__) || defined(MACOSX)
bPinGenericMemory = false; // Generic Pinning of System Paged memory not currently supported on Mac OSX
#else
bPinGenericMemory = true;
#endif
}
if( checkCmdLineFlag( argc, (const char **)argv, "use_cuda_malloc_host") ) {
bPinGenericMemory = false;
}

printf("\n> ");
cuda_device = findCudaDevice(argc, (const char **)argv);

// check the compute capability of the device
int num_devices=0;
checkCudaErrors( cudaGetDeviceCount(&num_devices) );
if(0==num_devices)
{
printf("your system does not have a CUDA capable device, waiving test...\n");
shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
}

// check if the command-line chosen device ID is within range, exit if not
if( cuda_device >= num_devices )
{
printf("cuda_device=%d is invalid, must choose device ID between 0 and %d\n", cuda_device, num_devices-1);
shrQAFinishExit(argc, (const char **)argv, QA_FAILED);
}

cudaSetDevice( cuda_device );

// Checking for compute capabilities
cudaDeviceProp deviceProp;
checkCudaErrors( cudaGetDeviceProperties(&deviceProp, cuda_device) );
if( (1 == deviceProp.major) && (deviceProp.minor < 1)) {
printf("%s does not have Compute Capability 1.1 or newer. Reducing workload.\n", deviceProp.name);
}

if(deviceProp.major >= 2) {
niterations = 100;
} else {
if(deviceProp.minor > 1) {
niterations = 5;
} else {
niterations = 1; // reduced workload for compute capability 1.0 and 1.1
}
}

// Check if GPU can map host memory (Generic Method), if not then we override bPinGenericMemory to be false
if (bPinGenericMemory) {
printf("Device: <%s> canMapHostMemory: %s\n", deviceProp.name, deviceProp.canMapHostMemory ? "Yes" : "No");
if (deviceProp.canMapHostMemory == 0) {
printf("Using cudaMallocHost, CUDA device does not support mapping of generic host memory\n");
bPinGenericMemory = false;
}
}

// Anything that is less than 32 Cores will have scaled down workload
scale_factor = max((32.0f / (ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount) ), 1.0f);
n = (int)rint( (float)n / scale_factor );

printf("> CUDA Capable: SM %d.%d hardware\n", deviceProp.major, deviceProp.minor);
printf("> %d Multiprocessor(s) x %d (Cores/Multiprocessor) = %d (Cores)\n",
deviceProp.multiProcessorCount,
ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

printf("> scale_factor = %1.4f\n", 1.0f/scale_factor);
printf("> array_size = %d\n\n", n);

// enable use of blocking sync, to reduce CPU usage
printf("> Using CPU/GPU Device Synchronization method (%s)\n", sDeviceSyncMethod[device_sync_method]);
cudaSetDeviceFlags( device_sync_method | (bPinGenericMemory ? cudaDeviceMapHost : 0 ) );

// allocate host memory
int c = 5; // value to which the array will be initialized
int *h_a = 0;
int *h_b = 0;// pointer to the array data in host memory
int *h_c = 0;
int *hAligned_a = 0;
int *hAligned_b = 0;
int *hAligned_c = 0;
// pointer to the array data in host memory (aligned to MEMORY_ALIGNMENT)

// Allocate Host memory (could be using cudaMallocHost or VirtualAlloc/mmap if using the new CUDA 4.0 features
AllocateHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nbytes);
AllocateHostMemory(bPinGenericMemory, &h_b, &hAligned_b, nbytes);
AllocateHostMemory(bPinGenericMemory, &h_c, &hAligned_c, nbytes);
for(int i = 0; i < n; i++)
{
h_a[i] = rand() / (int)RAND_MAX;
h_b[i] = rand() / (int)RAND_MAX;
}
// allocate device memory
int *d_a = 0, *d_c = 0,*d_b=0; // pointers to data and init value in the device memory
MY_SAFE_CALL(cudaMalloc(&d_a, nbytes));
MY_SAFE_CALL(cudaMalloc(&d_b, nbytes));
MY_SAFE_CALL(cudaMalloc(&d_c, nbytes));
checkCudaErrors( cudaMemcpy(d_c, &c, sizeof(int), cudaMemcpyHostToDevice) );

printf("\nStarting Test\n");

// allocate and initialize an array of stream handles
cudaStream_t *streams = (cudaStream_t*) malloc(nstreams * sizeof(cudaStream_t));
for(int i = 0; i < nstreams; i++) {
checkCudaErrors( cudaStreamCreate(&(streams[i])) );
}

// create CUDA event handles
// use blocking sync
cudaEvent_t start_event, stop_event;
int eventflags = ( (device_sync_method == cudaDeviceBlockingSync) ? cudaEventBlockingSync: cudaEventDefault );

checkCudaErrors( cudaEventCreateWithFlags(&start_event, eventflags) );
checkCudaErrors( cudaEventCreateWithFlags(&stop_event, eventflags) );

// time memcopy from device
cudaEventRecord(start_event, 0); // record in stream-0, to ensure that all previous CUDA calls have completed
cudaMemcpyAsync(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost, streams[0]);
cudaMemcpyAsync(hAligned_b, d_b, nbytes, cudaMemcpyDeviceToHost, streams[0]);
cudaMemcpyAsync(hAligned_c, d_c, nbytes, cudaMemcpyDeviceToHost, streams[0]);
cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event); // block until the event is actually recorded
checkCudaErrors( cudaEventElapsedTime(&time_memcpy, start_event, stop_event) );
printf("memcopy:\t%.2f\n", time_memcpy);

// time kernel
threads=dim3(512, 1);
blocks=dim3(n / threads.x, 1);
cudaEventRecord(start_event, 0);
vectorAddGPU<<<blocks, threads, 0, streams[0]>>>(d_a,d_b, d_c, n,nreps);
cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
checkCudaErrors( cudaEventElapsedTime(&time_kernel, start_event, stop_event) );
printf("kernel:\t\t%.2f\n", time_kernel);

//////////////////////////////////////////////////////////////////////
// time non-streamed execution for reference
threads=dim3(512, 1);
blocks=dim3(n / threads.x, 1);
cudaEventRecord(start_event, 0);

cudaMemcpy(d_a, h_a, nbytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, nbytes, cudaMemcpyHostToDevice);
vectorAddGPU<<<blocks, threads>>>(d_a,d_b, d_c, n,nreps);
cudaMemcpy(hAligned_c, d_c, nbytes, cudaMemcpyDeviceToHost);

cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
checkCudaErrors( cudaEventElapsedTime(&elapsed_time, start_event, stop_event) );
printf("non-streamed:\t%.2f (%.2f expected)\n", elapsed_time, time_kernel + time_memcpy);

//////////////////////////////////////////////////////////////////////
// time execution with nstreams streams
threads=dim3(512,1);
blocks=dim3(n/(nstreams*threads.x),1);
// memset(hAligned_a, 255, nbytes); // set host memory bits to all 1s, for testing correctness
// cudaMemset(d_a, 0, nbytes); // set device memory to all 0s, for testing correctness
cudaEventRecord(start_event, 0);

cudaMemcpy(d_a,h_a, nbytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_b,h_b, nbytes , cudaMemcpyHostToDevice);
// asynchronously launch nstreams kernels, each operating on its own portion of data
for(int i = 0; i < nstreams; i++){
// cudaMemcpyAsync(d_a + i * n / nstreams,h_a + i * n / nstreams, nbytes / nstreams, cudaMemcpyHostToDevice, streams[i]);
// cudaMemcpyAsync(d_b + i * n / nstreams,h_b + i * n / nstreams, nbytes / nstreams, cudaMemcpyHostToDevice, streams[i]);
vectorAddGPU<<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams, d_b + i * n / nstreams,d_c + i * n / nstreams, n/nstreams,nreps);
cudaMemcpyAsync(h_c + i * n / nstreams, d_c + i * n / nstreams, nbytes / nstreams, cudaMemcpyDeviceToHost, streams[i]);

// asynchronously launch nstreams memcopies. Note that memcopy in stream x will only
// commence executing when all previous CUDA calls in stream x have completed
// for(int i = 0; i < nstreams; i++)

}
cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
checkCudaErrors( cudaEventElapsedTime(&elapsed_time, start_event, stop_event) );
printf("%d streams:\t%.2f (%.2f expected with compute capability 1.1 or later)\n", nstreams, elapsed_time , time_kernel + time_memcpy / nstreams);

// check whether the output is correct
printf("-------------------------------\n");
bool bResults = correct_data(hAligned_a,hAligned_b,h_c, n);

// release resources
for(int i = 0; i < nstreams; i++) {
cudaStreamDestroy(streams[i]);
}
cudaEventDestroy(start_event);
cudaEventDestroy(stop_event);

// Free cudaMallocHost or Generic Host allocated memory (from CUDA 4.0)
FreeHostMemory( bPinGenericMemory, &h_a, &hAligned_a, nbytes );

cudaFree(d_a);
cudaFree(d_c);

cudaDeviceReset();
shrQAFinishExit(argc, (const char **)argv, bResults ? QA_PASSED : QA_FAILED);
}

[/code]
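For clarity, the issue order I am trying to get to work is the breadth-first variant from my first post: all H2D copies first, then all kernels, then all D2H copies. Written against the buffer and stream names in the code above, it would be roughly this sketch:
[code]
// Sketch of the breadth-first issue order (all H2D, then all kernels, then all D2H),
// using the buffers and streams from the code above.
const int chunk = n / nstreams;

for (int s = 0; s < nstreams; s++)             // stage 1: all H2D copies
{
    cudaMemcpyAsync(d_a + s*chunk, h_a + s*chunk, chunk*sizeof(int),
                    cudaMemcpyHostToDevice, streams[s]);
    cudaMemcpyAsync(d_b + s*chunk, h_b + s*chunk, chunk*sizeof(int),
                    cudaMemcpyHostToDevice, streams[s]);
}
for (int s = 0; s < nstreams; s++)             // stage 2: all kernels
    vectorAddGPU<<<(chunk+511)/512, 512, 0, streams[s]>>>
        (d_a + s*chunk, d_b + s*chunk, d_c + s*chunk, chunk, nreps);
for (int s = 0; s < nstreams; s++)             // stage 3: all D2H copies
    cudaMemcpyAsync(h_c + s*chunk, d_c + s*chunk, chunk*sizeof(int),
                    cudaMemcpyDeviceToHost, streams[s]);
cudaDeviceSynchronize();
[/code]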
In order to figure out what is going wrong I created a small test case with a single GPU by modifying the SDK example.

I can get 2-way concurrency(kernel+d2h)but not three-way(h2d+kernel+d2h).

I am attaching the code that produces 2-way concurrency.

If I use MemcpyAsync for the H2D transfers the results are correct but the whole process get serialized.

The same thing happened In the CUBLAS code from the webinar,I don't know why.

Any ideas?

I am terribly buffled by this.please note that in the multi-GPU case it isn't practical to use 2-way concurrency since I have to wait for the H2D transfer to complete before I change context.

Thanks in advance,



Apostolis







#8
Posted 04/26/2012 09:30 PM   
Which GPU are you using for your test?
Only recent Tesla and Quadro have two DMA engines to overlap I/O and compute in both directions.
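You can check this at runtime; a small stand-alone query along these lines (device 0 assumed, and asyncEngineCount needs CUDA 4.0 or later) should report what your card supports:

[code]
// Report whether device 0 can overlap copies with kernels, and in how many directions
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0); // device 0 assumed

    // deviceOverlap: non-zero if copies can overlap kernel execution at all
    // asyncEngineCount: 0 = no overlap, 1 = one direction at a time, 2 = both directions
    printf("%s: deviceOverlap=%d, asyncEngineCount=%d\n",
           prop.name, prop.deviceOverlap, prop.asyncEngineCount);
    return 0;
}
[/code]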

#9
Posted 04/26/2012 09:54 PM   
An NVIDIA GTS 450, and for the multi-GPU case it is paired with a GT 430.
Is the dual DMA a Tesla feature, or do normal desktop GPUs have it as well?

#10
Posted 04/27/2012 01:09 PM   
GeForce cards have a single DMA engine.

#11
Posted 04/27/2012 01:43 PM   
So that means I can only overlap kernel execution with data transfer?
Or can I overlap device-to-host with device-to-device copies, but not with a kernel at the same time?

That has been really helpful.

Is this mentioned anywhere in the programming guide?

#12
Posted 04/27/2012 02:11 PM   
On GeForce, you can overlap kernel execution with data transfer in one direction.

It should be in the programming guide, not sure where.

#13
Posted 04/27/2012 02:35 PM   
Or use a kernel that transfers data in the other direction via zerocopy.
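Roughly like this (a sketch only, with hypothetical names for the kernel, sizes, and launch configuration; it needs a device that reports canMapHostMemory, and cudaSetDeviceFlags(cudaDeviceMapHost) has to be called before any other CUDA call):

[code]
// Map pinned host memory into the device address space; the kernel then pulls its input
// over PCIe itself, leaving the copy engine free for device-to-host transfers.
float *h_in = 0, *d_in = 0, *d_out = 0;
cudaSetDeviceFlags(cudaDeviceMapHost);                                  // before the context is created
cudaHostAlloc((void **)&h_in, n * sizeof(float), cudaHostAllocMapped);  // pinned + mapped
cudaHostGetDevicePointer((void **)&d_in, h_in, 0);                      // device-visible alias of h_in
cudaMalloc((void **)&d_out, n * sizeof(float));

// d_in can be dereferenced by the kernel like any other device pointer
someKernel<<<grid, block, 0, stream>>>(d_in, d_out, n);
[/code]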

Always check return codes of CUDA calls for errors. Do not use __syncthreads() in conditional code unless the condition is guaranteed to evaluate identically for all threads of each block. Run your program under cuda-memcheck to detect stray memory accesses. If your kernel dies for larger problem sizes, it might exceed the runtime limit and trigger the watchdog timer.

#14
Posted 04/27/2012 02:39 PM   
[quote name='tera' date='27 April 2012 - 04:39 PM' timestamp='1335537571' post='1401593']
Or use a kernel that transfers data in the other direction via zerocopy.
[/quote]

If it is a hardware limitation, that shouldn't work either.
If it does work, it means it is just a software limitation.
I will try it.

#15
Posted 04/27/2012 03:20 PM   