I am wanting to understand shared memory better and how to use it for my application. I have read through some of the documents and its still not clear to me how to put the variables i need in shared memory and access them for faster performance.
I have written a basic generic kernel based on my application that doesn’t use shared memory and one that uses shared memory how i understand it.
non shared memory
#define twoPi 6.283185307179586
__global__ void delay(const float *array1,
const double *array2,
const float scalar1,
const float scalar2,
const float scalar3,
const float scalar4,
const floar offset,
const int integer1,
float *RealOutput,
float *ImagOuptut)
{
int idx;
idx = threadIdx.x;
float fractional;
double step;
double delay;
float real, imag;
float sumReal, sumImag;
double st;
step = (double)scalar1 + (double)scalar2 * ((double)idx/(double)integer1);
time = (double)scalar3 + (double)scalar4 *(double)idx + (double)offset;
for (int i = 0; i < 999; i++)
{
delay = step*(1.0 - array2[i]);
fractional = (float)(twoPi * (delay - trunc(delay)));
/* calculate the exponent. */
if (fractional < 0.0)
{
__sincosf(-fractional, &imag, &real);
imag = -imag;
} else {
__sincosf(fractional, &imag, &real);
}
sumReal = sumReal + array1[i] * real;
sumImag = sumImag + array1[i] * imag;
}
st = step*time;
fractional = -(float)(twoPi*(st - trunc(st)));
/* calculate the exponent. */
if (fractional < 0.0)
{
__sincosf(-fractional, &imag, &real);
imag = -imag;
} else {
__sincosf(fractional, &imag, &real);
}
RealOutput[idx] = sumReal * real - sumImag * imag;
ImagOutput[idx] = sumReal * imag + sumImag * real;
}
shared memory
#define twoPi 6.283185307179586
__global__ void delay(const float *array1,
const double *array2,
const float scalar1,
const float scalar2,
const float scalar3,
const float scalar4,
const float offset,
const int integer1,
float *RealOutput,
float *ImagOuptut)
{
int idx;
idx = threadIdx.x;
__shared__ float SharedArr1[999];
__shared__ double SharedArr2[999];
__shared__ float Shared1, Shared2, Shared3, Shared4, SharedOffset;
__shared__ int SharedInt;
/* populate the shared memory */
if (idx = 0)
{
for (int i = 0; i < 999; i++)
{
SharedArr1[i] = array1[i];
SharedArr2[i] = array2[i];
}
Shared1 = scalar1;
Shared2 = scalar2;
Shared3 = scalar3;
Shared4 = scalar4;
SharedOffset = offset;
SharedInt = integer1;
}
__syncthreads();
float fractional;
double step;
double delay;
float real, imag;
float sumReal, sumImag;
double st;
step = (double)Shared1 + (double)Shared2 * ((double)idx/(double)SharedInt);
time = (double)Shared3 + (double)Shared4 *(double)idx + (double)SharedOffset;
for (int i = 0; i < 999; i++)
{
delay = step*(1.0 - array2[i]);
fractional = (float)(twoPi * (delay - trunc(delay)));
/* calculate the exponent. */
if (fractional < 0.0)
{
__sincosf(-fractional, &imag, &real);
imag = -imag;
} else {
__sincosf(fractional, &imag, &real);
}
sumReal = sumReal + array1[i] * real;
sumImag = sumImag + array1[i] * imag;
}
st = step*time;
fractional = -(float)(twoPi*(st - trunc(st)));
/* calculate the exponent. */
if (fractional < 0.0)
{
__sincosf(-fractional, &imag, &real);
imag = -imag;
} else {
__sincosf(fractional, &imag, &real);
}
RealOutput[idx] = sumReal * real - sumImag * imag;
ImagOutput[idx] = sumReal * imag + sumImag * real;
}
The kernel call will be:
delay<<<1,512>>>(array1,… realOut, imagOut);
I have tried this implementation out on my applciation and it doesn’t work, I know its got something to do with my understanding. My question is, do i have the basic syntax right for distributing the variables into shared memory and accessing them appropriately? I know there is a limit to the amount of shared memory i have, i think this is 49152 bytes for my Tesla C2050 card. This should be anough for what i’m using here, 12010 bytes. Also since each thread accesses every element in both arrays, is there a better way to do this?