Concurrent Kernel using GTX 570 on WinXp Concurrent Kernel
Hi all,
I am working on concurrent kernel execution now.
I wrote a program to test concurrent kernel execution on my desktop (WinXP, VS2008, CUDA 3.2, GTX 570).
But I don't see much speedup.
Can anyone give me some advice?
Thank you very much.

The code is as follows:
#include <stdio.h>
#include <cuda.h>
#include <cuPrintf.cu>
#include "cutil_inline.h"
#include "cuda_runtime.h"
#include <iostream>
#include <cutil.h>
#include "cudaHelper.h"


using namespace std;

// Busy-work kernel used only to keep the GPU occupied for timing purposes.
//
// Every thread folds sin(i) + tan(i), for i in [0, n), into an int product
// that starts at 1, then stores that product at a[threadIdx.x].
//
//   a : output buffer; it is indexed by threadIdx.x only, so it needs at
//       least blockDim.x ints and every block of one launch writes the same
//       elements.
//   n : number of loop iterations executed by each thread.
__global__ void mykernel( int *a, int n )
{
    int product = 1;
    int step = 0;
    while (step < n)
    {
        const float x = (float)step;
        // int *= float: the product goes through float and back to int on
        // every iteration.
        product *= sin(x) + tan(x);
        ++step;
    }
    a[threadIdx.x] = product;
}
// Times `nkernels` launches of mykernel issued into separate streams and
// compares that with the time the same launches would need if they ran one
// after another (nkernels * the time of a single launch).
//
// Command line: the "qatest" flag makes the program print a PASSED banner.
// Returns 0 on completion and 1 if the host stream array cannot be allocated.
// CUDA API results are passed to cutilSafeCall / cutilCheckMsg (presumably
// they report and abort on failure -- confirm in cutil_inline.h).
int main(int argc, const char **argv)
{
    int nblocks = 4;      // blocks per kernel launch
    int nthreads = 64;    // threads per block
    int n = 500000;       // loop iterations executed by every kernel thread
    int nkernels = 8;     // kernels (and streams) launched back to back
    int nbytes;

    int devID;
    cudaDeviceProp deviceProps;

    int *d_A = 0;
    int *d_B = 0;
    cudaStream_t *stream;
    cudaEvent_t start, stop;
    float elapsedTime;

    int qatest = 0;

    printf("[concurrentKernels] - Starting...\n\n");
    devID = 0;
    cutilSafeCall(cudaSetDevice(devID));

    // QA testing mode
    if (cutCheckCmdLineFlag(argc, (const char**)argv, "qatest"))
    {
        qatest = 1;
    }

    cutilSafeCall(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA Device %s has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount);
    printf("CUDA Device %s is%s capable of concurrent kernel execution\n", deviceProps.name, (deviceProps.concurrentKernels==0)?" NOT":"");

    // One stream per kernel; check the allocation before the array is used.
    stream = (cudaStream_t *)malloc(nkernels * sizeof(cudaStream_t));
    if (!stream)
    {
        fprintf(stderr, "Failed to allocate host memory for %d streams\n", nkernels);
        return 1;
    }
    for (int i = 0; i < nkernels; i++)
    {
        cutilSafeCall(cudaStreamCreate(&stream[i]));
    }

    // Kernel i writes its own nthreads-sized slice of d_A. The blocks of one
    // launch all write the same slice (mykernel indexes by threadIdx.x only),
    // which is acceptable because the output is never read back.
    nbytes = nkernels * nthreads * sizeof(int);
    cutilSafeCall(cudaMalloc((void **)&d_A, nbytes));
    cutilSafeCall(cudaMalloc((void **)&d_B, nbytes));

    cutilSafeCall(cudaEventCreate(&start));
    cutilSafeCall(cudaEventCreate(&stop));

    // start timer then launch all kernels in their streams
    cutilSafeCall(cudaEventRecord(start, 0));
    for (int i = 0; i < nkernels; i++)
    {
        // avoid synchronization points (events, error checks, etc.) inside
        // this loop in order to get concurrent execution on devices that support it
        mykernel<<<nblocks, nthreads, 0, stream[i]>>>(&d_A[i*nthreads], n);
    }

    // The stop event is recorded in stream 0. NOTE(review): this relies on
    // stream 0 being ordered after earlier work in the other streams --
    // confirm against the CUDA default-stream documentation.
    cutilSafeCall(cudaEventRecord(stop, 0));
    cutilSafeCall(cudaEventSynchronize(stop));
    // Launch errors are checked once, after the timed region.
    cutilCheckMsg("kernel launch failure");
    // get total time for all kernels
    cutilSafeCall(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("\nAll %d kernels together took %.3fs\n", nkernels, elapsedTime/1000.f);

    // Reference measurement: one kernel on its own in the default stream.
    cudaEvent_t start1, stop1;
    float elapsedTime1 = 0.0f;
    cutilSafeCall(cudaEventCreate(&start1));
    cutilSafeCall(cudaEventCreate(&stop1));

    cutilSafeCall(cudaEventRecord(start1, 0));
    mykernel<<<nblocks, nthreads>>>(d_B, n);
    cutilSafeCall(cudaEventRecord(stop1, 0));
    cutilSafeCall(cudaEventSynchronize(stop1));
    // Checked after the timed region so the check is not part of the measurement.
    cutilCheckMsg("kernel launch failure");
    cutilSafeCall(cudaEventElapsedTime(&elapsedTime1, start1, stop1));

    // Without concurrency the nkernels launches would run back to back, so
    // the figure to compare against is nkernels times the single-kernel
    // time, not the single-kernel time itself.
    printf("a single kernel took %.3fs\n", elapsedTime1/1000.f);
    printf("if no concurrent execution the time is %.3fs\n", nkernels * elapsedTime1/1000.f);

    // cleanup
    printf("\nCleaning up...\n");
    cutilSafeCall(cudaEventDestroy(start));
    cutilSafeCall(cudaEventDestroy(stop));
    cutilSafeCall(cudaEventDestroy(start1));
    cutilSafeCall(cudaEventDestroy(stop1));
    for (int i = 0; i < nkernels; i++)
    {
        cutilSafeCall(cudaStreamDestroy(stream[i]));
    }
    free(stream);
    if (d_A)
        cutilSafeCall(cudaFree(d_A));
    if (d_B)
        cutilSafeCall(cudaFree(d_B));

    if (qatest)
    {
        // any errors that might have happened will have already been reported
        printf("[concurrentKernels] - Test Results:\nPASSED\n");
    }
    return 0;
}
Hi all,

I am working on concurrent kernel execution now.

I wrote a program to test concurrent kernel execution on my desktop (WinXP, VS2008, CUDA 3.2, GTX 570).

But I don't see much speedup.

Can anyone give me some advice?

Thank you very much.



The code is as follows:

#include <stdio.h>

#include <cuda.h>

#include <cuPrintf.cu>

#include "cutil_inline.h"

#include "cuda_runtime.h"

#include <iostream>

#include <cutil.h>

#include "cudaHelper.h"





using namespace std;



// Busy-work kernel used only to keep the GPU occupied for timing purposes.
//
// Every thread folds sin(i) + tan(i), for i in [0, n), into an int product
// that starts at 1, then stores that product at a[threadIdx.x].
//
//   a : output buffer; it is indexed by threadIdx.x only, so it needs at
//       least blockDim.x ints and every block of one launch writes the same
//       elements.
//   n : number of loop iterations executed by each thread.
__global__ void mykernel( int *a, int n )
{
    int product = 1;
    int step = 0;
    while (step < n)
    {
        const float x = (float)step;
        // int *= float: the product goes through float and back to int on
        // every iteration.
        product *= sin(x) + tan(x);
        ++step;
    }
    a[threadIdx.x] = product;
}

// Times `nkernels` launches of mykernel issued into separate streams and
// compares that with the time the same launches would need if they ran one
// after another (nkernels * the time of a single launch).
//
// Command line: the "qatest" flag makes the program print a PASSED banner.
// Returns 0 on completion and 1 if the host stream array cannot be allocated.
// CUDA API results are passed to cutilSafeCall / cutilCheckMsg (presumably
// they report and abort on failure -- confirm in cutil_inline.h).
int main(int argc, const char **argv)
{
    int nblocks = 4;      // blocks per kernel launch
    int nthreads = 64;    // threads per block
    int n = 500000;       // loop iterations executed by every kernel thread
    int nkernels = 8;     // kernels (and streams) launched back to back
    int nbytes;

    int devID;
    cudaDeviceProp deviceProps;

    int *d_A = 0;
    int *d_B = 0;
    cudaStream_t *stream;
    cudaEvent_t start, stop;
    float elapsedTime;

    int qatest = 0;

    printf("[concurrentKernels] - Starting...\n\n");
    devID = 0;
    cutilSafeCall(cudaSetDevice(devID));

    // QA testing mode
    if (cutCheckCmdLineFlag(argc, (const char**)argv, "qatest"))
    {
        qatest = 1;
    }

    cutilSafeCall(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA Device %s has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount);
    printf("CUDA Device %s is%s capable of concurrent kernel execution\n", deviceProps.name, (deviceProps.concurrentKernels==0)?" NOT":"");

    // One stream per kernel; check the allocation before the array is used.
    stream = (cudaStream_t *)malloc(nkernels * sizeof(cudaStream_t));
    if (!stream)
    {
        fprintf(stderr, "Failed to allocate host memory for %d streams\n", nkernels);
        return 1;
    }
    for (int i = 0; i < nkernels; i++)
    {
        cutilSafeCall(cudaStreamCreate(&stream[i]));
    }

    // Kernel i writes its own nthreads-sized slice of d_A. The blocks of one
    // launch all write the same slice (mykernel indexes by threadIdx.x only),
    // which is acceptable because the output is never read back.
    nbytes = nkernels * nthreads * sizeof(int);
    cutilSafeCall(cudaMalloc((void **)&d_A, nbytes));
    cutilSafeCall(cudaMalloc((void **)&d_B, nbytes));

    cutilSafeCall(cudaEventCreate(&start));
    cutilSafeCall(cudaEventCreate(&stop));

    // start timer then launch all kernels in their streams
    cutilSafeCall(cudaEventRecord(start, 0));
    for (int i = 0; i < nkernels; i++)
    {
        // avoid synchronization points (events, error checks, etc.) inside
        // this loop in order to get concurrent execution on devices that support it
        mykernel<<<nblocks, nthreads, 0, stream[i]>>>(&d_A[i*nthreads], n);
    }

    // The stop event is recorded in stream 0. NOTE(review): this relies on
    // stream 0 being ordered after earlier work in the other streams --
    // confirm against the CUDA default-stream documentation.
    cutilSafeCall(cudaEventRecord(stop, 0));
    cutilSafeCall(cudaEventSynchronize(stop));
    // Launch errors are checked once, after the timed region.
    cutilCheckMsg("kernel launch failure");
    // get total time for all kernels
    cutilSafeCall(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("\nAll %d kernels together took %.3fs\n", nkernels, elapsedTime/1000.f);

    // Reference measurement: one kernel on its own in the default stream.
    cudaEvent_t start1, stop1;
    float elapsedTime1 = 0.0f;
    cutilSafeCall(cudaEventCreate(&start1));
    cutilSafeCall(cudaEventCreate(&stop1));

    cutilSafeCall(cudaEventRecord(start1, 0));
    mykernel<<<nblocks, nthreads>>>(d_B, n);
    cutilSafeCall(cudaEventRecord(stop1, 0));
    cutilSafeCall(cudaEventSynchronize(stop1));
    // Checked after the timed region so the check is not part of the measurement.
    cutilCheckMsg("kernel launch failure");
    cutilSafeCall(cudaEventElapsedTime(&elapsedTime1, start1, stop1));

    // Without concurrency the nkernels launches would run back to back, so
    // the figure to compare against is nkernels times the single-kernel
    // time, not the single-kernel time itself.
    printf("a single kernel took %.3fs\n", elapsedTime1/1000.f);
    printf("if no concurrent execution the time is %.3fs\n", nkernels * elapsedTime1/1000.f);

    // cleanup
    printf("\nCleaning up...\n");
    cutilSafeCall(cudaEventDestroy(start));
    cutilSafeCall(cudaEventDestroy(stop));
    cutilSafeCall(cudaEventDestroy(start1));
    cutilSafeCall(cudaEventDestroy(stop1));
    for (int i = 0; i < nkernels; i++)
    {
        cutilSafeCall(cudaStreamDestroy(stream[i]));
    }
    free(stream);
    if (d_A)
        cutilSafeCall(cudaFree(d_A));
    if (d_B)
        cutilSafeCall(cudaFree(d_B));

    if (qatest)
    {
        // any errors that might have happened will have already been reported
        printf("[concurrentKernels] - Test Results:\nPASSED\n");
    }
    return 0;
}

#1
Posted 10/10/2011 02:19 PM   
Scroll To Top