Hi all,
because our application must not simply be fast but must also perform
some operations within fixed deadlines (we analyze a continuous radio signal),
we benchmark all our algorithms several times per day.
We are experiencing a clear performance degradation after moving from the old CUDA 4.0 to CUDA 4.1.
I have attached 4 images showing the historical performance data of 4 algorithms
(they are not the only ones affected, but they are the simplest ones for which to show you the kernel code).
For all graphs the reported time is in milliseconds (y-axis).
All kernels are launched in this way:
// Threads per block used for every launch: 1<<9 == 512.
#define BLOCK_SIZE (1<<9)
// One-dimensional thread block of BLOCK_SIZE threads.
dim3 myThreads(BLOCK_SIZE);
// One-dimensional grid sized by ceiling division, so that
// grid * block >= aSize even when aSize is not a multiple of BLOCK_SIZE.
dim3 myGrid( (aSize + BLOCK_SIZE - 1) / BLOCK_SIZE);
// Generic launch with that configuration (kernel arguments elided here).
Kernel<<< myGrid, myThreads>>>(…);
We have C2050 cards with ECC off.
============================================================================
Sum of two complex vectors (2^20 complex)
// Element-wise sum of two vectors of float2 values:
//   aOut[i].x = aIn1[i].x + aIn2[i].x
//   aOut[i].y = aIn1[i].y + aIn2[i].y
//
// Launch layout: one-dimensional grid and block, one thread per element.
// Threads whose global index is >= aSize do nothing, so the grid may be
// rounded up past aSize.
//
// aIn1, aIn2 : input vectors, at least aSize elements each.
// aOut       : output vector, at least aSize elements. It may be the same
//              buffer as one of the inputs: each thread reads its own
//              element completely before writing it.
// aSize      : number of elements to process.
__global__ void
VectorVectorSumKernelCC_O(const float2* aIn1,
                          const float2* aIn2,
                          float2* aOut,
                          const unsigned int aSize) {
    const unsigned int myPos = blockIdx.x * blockDim.x + threadIdx.x;
    if (myPos < aSize) {
        // Read each operand as a whole float2 BEFORE storing anything.
        // With member-by-member access, the store to aOut[myPos].x sits
        // between the .x and .y reads; since aOut may alias the inputs, the
        // compiler then cannot merge those reads into a single 8-byte access
        // per operand.
        const float2 myLhs = aIn1[myPos];
        const float2 myRhs = aIn2[myPos];
        // Single whole-element store of the result.
        aOut[myPos] = make_float2(myLhs.x + myRhs.x, myLhs.y + myRhs.y);
    }
}
============================================================================
Product of two complex vectors (2^20 complex)
// Element-wise complex product, out of place: aOut[i] = aIn1[i] * aIn2[i],
// with .x used as the real part and .y as the imaginary part:
//   aOut[i].x = aIn1[i].x * aIn2[i].x - aIn1[i].y * aIn2[i].y
//   aOut[i].y = aIn1[i].x * aIn2[i].y + aIn1[i].y * aIn2[i].x
//
// Launch layout: one-dimensional grid and block, one thread per element.
// Threads whose global index is >= aSize return without touching memory.
//
// aIn1, aIn2 : input vectors, at least aSize elements each.
// aOut       : output vector, at least aSize elements.
// aSize      : number of elements to process.
__global__ void
MulKernel_cv_cv_o(const float2* aIn1,
                  const float2* aIn2,
                  float2* aOut,
                  const unsigned int aSize) {
    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= aSize) {
        return;  // surplus thread in the last block
    }

    // Both operands are fully read before the result is written.
    const float2 lhs = aIn1[idx];
    const float2 rhs = aIn2[idx];

    float2 product;
    product.x = lhs.x * rhs.x - lhs.y * rhs.y;
    product.y = lhs.x * rhs.y + lhs.y * rhs.x;
    aOut[idx] = product;
}
============================================================================
Product of two complex vectors (2^20 complex), in place
// Element-wise complex product, in place: aInOut[i] = aIn[i] * aInOut[i],
// with .x used as the real part and .y as the imaginary part:
//   new.x = aIn[i].x * old.x - aIn[i].y * old.y
//   new.y = aIn[i].x * old.y + aIn[i].y * old.x
// where old is the value of aInOut[i] on entry.
//
// Launch layout: one-dimensional grid and block, one thread per element.
// Threads whose global index is >= aSize return without touching memory.
//
// aIn    : multiplier vector, at least aSize elements (read only).
// aInOut : vector multiplied in place, at least aSize elements.
// aSize  : number of elements to process.
__global__ void
MulKernel_cv_cv_i(const float2* aIn,
                  float2* aInOut,
                  const unsigned int aSize) {
    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= aSize) {
        return;  // surplus thread in the last block
    }

    // Snapshot both operands first: the old real part of aInOut[idx] is
    // still needed for the imaginary result after the real result is formed.
    const float2 factor = aIn[idx];
    const float2 value = aInOut[idx];

    float2 product;
    product.x = factor.x * value.x - factor.y * value.y;
    product.y = factor.x * value.y + factor.y * value.x;
    aInOut[idx] = product;
}
============================================================================
Tone generation (vector of length 2^20)
// Fills aInOut with a sampled complex exponential. For each index i < aSize:
//   arg         = aNormalizedFrequency * fmodf((float)i, aInverseFrequency) + aPhase
//   aInOut[i].x = aMagnitude * __cosf(arg)
//   aInOut[i].y = aMagnitude * __sinf(arg)
// The previous contents of aInOut are never read; every processed element is
// overwritten.
//
// __cosf/__sinf are the fast-math intrinsics, which trade accuracy for speed
// compared with cosf/sinf.
// NOTE(review): aInverseFrequency looks like the tone period in samples, with
// fmodf keeping the intrinsic's argument small - confirm against the caller.
// NOTE(review): the index is converted to float, so indices above 2^24 would
// no longer be represented exactly.
//
// Launch layout: one-dimensional grid and block, one thread per element.
// Threads whose global index is >= aSize return without touching memory.
__global__ void
ComplexExpKernel(float2* aInOut,
                 const unsigned int aSize,
                 const float aMagnitude,
                 const float aNormalizedFrequency,
                 const float aInverseFrequency,
                 const float aPhase) {
    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= aSize) {
        return;  // surplus thread in the last block
    }

    const float angle = aNormalizedFrequency * fmodf((float)idx, aInverseFrequency) + aPhase;

    float2 sample;
    sample.x = aMagnitude * __cosf(angle);
    sample.y = aMagnitude * __sinf(angle);
    aInOut[idx] = sample;
}
============================================================================