Im trying to understand how to use shared memory properly. I’m not sure i quite understand how to use shared memory for my application to make things faster.
I have a written up a somewhat generic kernel based on my application and anotehr kernel with what i think needs to be done to implement the shared memory part.
non shared memory example
#define twoPi 6.283185307179586
__global__ void delay(
const float *array1,
const double *array2,
const float scalar1,
const float scalar2,
const float scalar3,
const float scalar4,
const floar offset,
const int integer1,
float *RealOutput,
float *ImagOuptut)
{
int idx;
idx = threadIdx.x;
float fractional;
double step;
double delay;
float real, imag;
float sumReal, sumImag;
double st;
step = (double)scalar1 + (double)scalar2 * ((double)idx/(double)integer1);
time = (double)scalar3 + (double)scalar4 *(double)idx + (double)offset;
for (int i = 0; i < 999; i++)
{
delay = step*(1.0 - array2[i]);
fractional = (float)(twoPi * (delay - trunc(delay)));
/* calculate the exponenet. */
if (fractional < 0.0)
{
__sincosf(-fractional, &imag, &real);
imag = -imag;
} else {
__sincosf(fractional, &imag, &real);
}
sumReal = sumReal + array1[i] * real;
sumImag = sumImag + array1[i] * imag;
}
st = step*time;
fractional = -(float)(twoPi*(st - trunc(st)));
if (fractional < 0.0)
{
__sincosf(-fractional, &imag, &real);
imag = -imag;
} else {
__sincosf(fractional, &imag, &real);
}
RealOutput[idx] = sumReal * real - sumImag * imag;
ImagOutput[idx] = sumReal * imag + sumImag * real;
}
Shared memory example
#define twoPi 6.283185307179586
__global__ void delay(
const float *array1,
const double *array2,
const float scalar1,
const float scalar2,
const float scalar3,
const float scalar4,
const float offset,
const int integer1,
float *RealOutput,
float *ImagOuptut)
{
int idx;
idx = threadIdx.x;
__shared__ float SharedArr1[999];
__shared__ double SharedArr2[999];
__shared__ float Shared1, Shared2, Shared3, Shared4, SharedOffset;
__shared__ int SharedInt;
/* populate the shared memory */
if (idx = 0)
{
for (int i == 0; i < 999; i++)
{
SharedArr1[i] = array1[i];
SharedArr2[i] = array2[i];
}
Shared1 = scalar1;
Shared2 = scalar2;
Shared3 = scalar3;
Shared4 = scalar4;
SharedOffset = offset;
SharedInt = integer1;
}
__syncthreads();
float fractional;
double step;
double delay;
float real, imag;
float sumReal, sumImag;
double st;
step = (double)Shared1 + (double)Shared2 * ((double)idx/(double)SharedInt);
time = (double)Shared3 + (double)Shared4 *(double)idx + (double)SharedOffset;
for (int i = 0; i < 999; i++)
{
delay = step*(1.0 - array2[i]);
fractional = (float)(twoPi * (delay - trunc(delay)));
/* calculate the exponenet. */
if (fractional < 0.0)
{
__sincosf(-fractional, &imag, &real);
imag = -imag;
} else {
__sincosf(fractional, &imag, &real);
}
sumReal = sumReal + array1[i] * real;
sumImag = sumImag + array1[i] * imag;
}
st = step*time;
fractional = -(float)(twoPi*(st - trunc(st)));
/**/
if (fractional < 0.0)
{
__sincosf(-fractional, &imag, &real);
imag = -imag;
} else {
__sincosf(fractional, &imag, &real);
}
RealOutput[idx] = sumReal * real - sumImag * imag;
ImagOutput[idx] = sumReal * imag + sumImag * real;
}
the kernel call will be:
delay<<<1, 512>>>(array1, array2,…, RealOut, ImagOut);
All the kernels input parameters are used by each thread, including the full arrays. I know there is a limit to how much shared memory i can have, i think this is 49152 bytes for my Telsa C2050). I’ve read through the documentation, and im still not exactely clear on how to access shared memory. My question is do i have the basic syntax right to use shared memory? and how can i distrubute the arrays on to shared memory when each thread needs the entire array?
I know this implementation doesn’t work, so i know i’m dodin something wrong.