Shared memory example
Im trying to understand how to use shared memory properly. I'm not sure i quite understand how to use shared memory for my application to make things faster.
I have a written up a somewhat generic kernel based on my application and anotehr kernel with what i think needs to be done to implement the shared memory part.

non shared memory example
[code]
#define twoPi 6.283185307179586

__global__ void delay(
const float *array1,
const double *array2,
const float scalar1,
const float scalar2,
const float scalar3,
const float scalar4,
const floar offset,
const int integer1,
float *RealOutput,
float *ImagOuptut)
{

int idx;
idx = threadIdx.x;

float fractional;
double step;
double delay;
float real, imag;
float sumReal, sumImag;
double st;

step = (double)scalar1 + (double)scalar2 * ((double)idx/(double)integer1);
time = (double)scalar3 + (double)scalar4 *(double)idx + (double)offset;

for (int i = 0; i < 999; i++)
{
delay = step*(1.0 - array2[i]);
fractional = (float)(twoPi * (delay - trunc(delay)));

/* calculate the exponenet. */
if (fractional < 0.0)
{
__sincosf(-fractional, &imag, &real);
imag = -imag;
} else {
__sincosf(fractional, &imag, &real);
}
sumReal = sumReal + array1[i] * real;
sumImag = sumImag + array1[i] * imag;
}

st = step*time;
fractional = -(float)(twoPi*(st - trunc(st)));

if (fractional < 0.0)
{
__sincosf(-fractional, &imag, &real);
imag = -imag;
} else {
__sincosf(fractional, &imag, &real);
}

RealOutput[idx] = sumReal * real - sumImag * imag;
ImagOutput[idx] = sumReal * imag + sumImag * real;
}


[/code]

Shared memory example
[code]
#define twoPi 6.283185307179586

__global__ void delay(
const float *array1,
const double *array2,
const float scalar1,
const float scalar2,
const float scalar3,
const float scalar4,
const float offset,
const int integer1,
float *RealOutput,
float *ImagOuptut)
{

int idx;
idx = threadIdx.x;

__shared__ float SharedArr1[999];
__shared__ double SharedArr2[999];
__shared__ float Shared1, Shared2, Shared3, Shared4, SharedOffset;
__shared__ int SharedInt;

/* populate the shared memory */
if (idx = 0)
{
for (int i == 0; i < 999; i++)
{
SharedArr1[i] = array1[i];
SharedArr2[i] = array2[i];
}
Shared1 = scalar1;
Shared2 = scalar2;
Shared3 = scalar3;
Shared4 = scalar4;
SharedOffset = offset;
SharedInt = integer1;
}
__syncthreads();


float fractional;
double step;
double delay;
float real, imag;
float sumReal, sumImag;
double st;

step = (double)Shared1 + (double)Shared2 * ((double)idx/(double)SharedInt);
time = (double)Shared3 + (double)Shared4 *(double)idx + (double)SharedOffset;

for (int i = 0; i < 999; i++)
{
delay = step*(1.0 - array2[i]);
fractional = (float)(twoPi * (delay - trunc(delay)));

/* calculate the exponenet. */
if (fractional < 0.0)
{
__sincosf(-fractional, &imag, &real);
imag = -imag;
} else {
__sincosf(fractional, &imag, &real);
}
sumReal = sumReal + array1[i] * real;
sumImag = sumImag + array1[i] * imag;
}

st = step*time;
fractional = -(float)(twoPi*(st - trunc(st)));

/**/
if (fractional < 0.0)
{
__sincosf(-fractional, &imag, &real);
imag = -imag;
} else {
__sincosf(fractional, &imag, &real);
}

RealOutput[idx] = sumReal * real - sumImag * imag;
ImagOutput[idx] = sumReal * imag + sumImag * real;
}

[/code]

the kernel call will be:
delay<<<1, 512>>>(array1, array2,...., RealOut, ImagOut);

All the kernels input parameters are used by each thread, including the full arrays. I know there is a limit to how much shared memory i can have, i think this is 49152 bytes for my Telsa C2050). I've read through the documentation, and im still not exactely clear on how to access shared memory. My question is do i have the basic syntax right to use shared memory? and how can i distrubute the arrays on to shared memory when each thread needs the entire array?
I know this implementation doesn't work, so i know i'm dodin something wrong.
Im trying to understand how to use shared memory properly. I'm not sure i quite understand how to use shared memory for my application to make things faster.

I have a written up a somewhat generic kernel based on my application and anotehr kernel with what i think needs to be done to implement the shared memory part.



non shared memory example



#define twoPi 6.283185307179586



__global__ void delay(

const float *array1,

const double *array2,

const float scalar1,

const float scalar2,

const float scalar3,

const float scalar4,

const floar offset,

const int integer1,

float *RealOutput,

float *ImagOuptut)

{



int idx;

idx = threadIdx.x;



float fractional;

double step;

double delay;

float real, imag;

float sumReal, sumImag;

double st;



step = (double)scalar1 + (double)scalar2 * ((double)idx/(double)integer1);

time = (double)scalar3 + (double)scalar4 *(double)idx + (double)offset;



for (int i = 0; i < 999; i++)

{

delay = step*(1.0 - array2[i]);

fractional = (float)(twoPi * (delay - trunc(delay)));



/* calculate the exponenet. */

if (fractional < 0.0)

{

__sincosf(-fractional, &imag, &real);

imag = -imag;

} else {

__sincosf(fractional, &imag, &real);

}

sumReal = sumReal + array1[i] * real;

sumImag = sumImag + array1[i] * imag;

}



st = step*time;

fractional = -(float)(twoPi*(st - trunc(st)));



if (fractional < 0.0)

{

__sincosf(-fractional, &imag, &real);

imag = -imag;

} else {

__sincosf(fractional, &imag, &real);

}



RealOutput[idx] = sumReal * real - sumImag * imag;

ImagOutput[idx] = sumReal * imag + sumImag * real;

}








Shared memory example



#define twoPi 6.283185307179586



__global__ void delay(

const float *array1,

const double *array2,

const float scalar1,

const float scalar2,

const float scalar3,

const float scalar4,

const float offset,

const int integer1,

float *RealOutput,

float *ImagOuptut)

{



int idx;

idx = threadIdx.x;



__shared__ float SharedArr1[999];

__shared__ double SharedArr2[999];

__shared__ float Shared1, Shared2, Shared3, Shared4, SharedOffset;

__shared__ int SharedInt;



/* populate the shared memory */

if (idx = 0)

{

for (int i == 0; i < 999; i++)

{

SharedArr1[i] = array1[i];

SharedArr2[i] = array2[i];

}

Shared1 = scalar1;

Shared2 = scalar2;

Shared3 = scalar3;

Shared4 = scalar4;

SharedOffset = offset;

SharedInt = integer1;

}

__syncthreads();





float fractional;

double step;

double delay;

float real, imag;

float sumReal, sumImag;

double st;



step = (double)Shared1 + (double)Shared2 * ((double)idx/(double)SharedInt);

time = (double)Shared3 + (double)Shared4 *(double)idx + (double)SharedOffset;



for (int i = 0; i < 999; i++)

{

delay = step*(1.0 - array2[i]);

fractional = (float)(twoPi * (delay - trunc(delay)));



/* calculate the exponenet. */

if (fractional < 0.0)

{

__sincosf(-fractional, &imag, &real);

imag = -imag;

} else {

__sincosf(fractional, &imag, &real);

}

sumReal = sumReal + array1[i] * real;

sumImag = sumImag + array1[i] * imag;

}



st = step*time;

fractional = -(float)(twoPi*(st - trunc(st)));



/**/

if (fractional < 0.0)

{

__sincosf(-fractional, &imag, &real);

imag = -imag;

} else {

__sincosf(fractional, &imag, &real);

}



RealOutput[idx] = sumReal * real - sumImag * imag;

ImagOutput[idx] = sumReal * imag + sumImag * real;

}






the kernel call will be:

delay<<<1, 512>>>(array1, array2,...., RealOut, ImagOut);



All the kernels input parameters are used by each thread, including the full arrays. I know there is a limit to how much shared memory i can have, i think this is 49152 bytes for my Telsa C2050). I've read through the documentation, and im still not exactely clear on how to access shared memory. My question is do i have the basic syntax right to use shared memory? and how can i distrubute the arrays on to shared memory when each thread needs the entire array?

I know this implementation doesn't work, so i know i'm dodin something wrong.

Physics is like sex,

Sure it has practical applications, but thats not why we do it.

- R. Feynman.

#1
Posted 04/24/2012 03:12 AM   
Some comments:

When you do the initialization, you should use all threads to copy the data from global memory to shared memory, i.e.[code]for (int i = threadIdx.x; i < 999; i+=blockDim.x)
{
SharedArr1[i] = array1[i];
SharedArr2[i] = array2[i];
}[/code]
and remove the if-statement that selects thread 0. Note also that in your code, you use == instead of = in the for-loop. Shared memory is useful to cache data from global memory. Storing the function parameters is shared memory however should not be necessary.

You are actually not using array1 and array2 instead of SharedArr1 and SharedArr2 in your code.
Some comments:



When you do the initialization, you should use all threads to copy the data from global memory to shared memory, i.e.
for (int i = threadIdx.x; i < 999; i+=blockDim.x)

{

SharedArr1[i] = array1[i];

SharedArr2[i] = array2[i];

}


and remove the if-statement that selects thread 0. Note also that in your code, you use == instead of = in the for-loop. Shared memory is useful to cache data from global memory. Storing the function parameters is shared memory however should not be necessary.



You are actually not using array1 and array2 instead of SharedArr1 and SharedArr2 in your code.

#2
Posted 04/24/2012 08:46 AM   
Scroll To Top