CUDA shared memory usage + no reduction with threads
Looking at Mark Harris's reduction example, I am trying to see if I can have threads store intermediate values without a reduction operation:

For example, the CPU code:
[code]
for(int i = 0; i < ntr; i++)
{
    for(int j = 0; j < pos*posdir; j++)
    {
        val = x[i] * arr[j];
        if(val > 0.0)
        {
            out[xcount] = val*x[i];
            xcount += 1;
        }
    }
}
[/code]

Equivalent GPU code:

[code]

const int threads = 64;
num_blocks = ntr/threads;

__global__ void test_g(float *in1, float *in2, float *out1, int *ct, int posdir, int pos)
{
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    __shared__ float t1[threads];
    __shared__ float t2[threads];

    int gcount = 0;

    for(int i = 0; i < posdir*pos; i += 32) {
        if (threadIdx.x < 32) {
            t1[threadIdx.x] = in2[i%posdir];
        }
        __syncthreads();

        for(int i = 0; i < 32; i++)
        {
            t2[i] = t1[i] * in1[tid];
            if(t2[i] > 0){
                out1[gcount] = t2[i] * in1[tid];
                gcount = gcount + 1;
            }
        }
    }
    ct[0] = gcount;
}
[/code]

As I debug, I find that for block (0,0,0) and thread (0,0,0) I can sequentially see the values of t2 being updated. After the debugger switches focus to block (0,0,0) and thread (32,0,0), the values of out1[0] are overwritten. How can I store the values of out1 for each thread and write them to the output?

Any suggestions? Thanks in advance !

#1
Posted 04/22/2012 08:52 PM   
Define offset=tid*32; and replace out1[gcount] with out1[offset+gcount]
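
For reference, a minimal sketch of only the lines that change (the rest of the kernel stays as posted above, and out1 is assumed to have room for 32 floats per thread):

[code]
int offset = tid*32;                          // each thread owns its own 32-element slot in out1

if(t2[i] > 0){
    out1[offset + gcount] = t2[i] * in1[tid]; // write into this thread's slot
    gcount = gcount + 1;
}
[/code]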
#2
Posted 04/22/2012 09:33 PM   
What I am trying to do here is the following steps:
(1) Store 32 values of in2 in the shared memory variable t1.
(2) For each value of i and in1[tid], calculate t2[i].
(3) If t2[i] > 0 for that particular combination of i, write t2[i]*in1[tid] to out1[offset+gcount].

But my output is all wrong. I am not even able to get a count of all the times t2[i] is greater than 0.

Any suggestions on how to save the value of gcount for each i and tid?


[code]
const int threads = 32;
num_blocks = ntr/threads;

__global__ void test_g(float *in1, float *in2, float *out1, int *ct, int posdir, int pos)
{
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    __shared__ float t1[threads];
    __shared__ float t2[threads];

    int gcount = 0;
    int offset = tid*32;

    for(int i = 0; i < posdir*pos; i += 32) {
        if (threadIdx.x < 32) {
            t1[threadIdx.x] = in2[i%posdir];
        }
        __syncthreads();

        for(int i = 0; i < 32; i++)
        {
            t2[i] = t1[i] * in1[tid];
            if(t2[i] > 0){
                out1[offset+gcount] = t2[i] * in1[tid];
                gcount = gcount + 1;
            }
        }
    }
    ct[0] = gcount;
}
[/code]

for example: in the CPU sequential code:
[code]
for(int i = 0; i < ntr; ++i)
{
    x[i] = i;
}

for(int i = 0; i < pos*posdir; ++i)
{
    arr[i] = -i;
}

arr[1] = 1;

for(int i = 0; i < ntr; i++)
{
    for(int j = 0; j < pos*posdir; j++)
    {
        val = x[i] * arr[j];
        if(val > 0.0)
        {
            out[xcount] = val*x[i];
            xcount += 1;
        }
    }
}

[/code]

My output for (ntr = 128; pos = 1; posdir = 32):
[code]
1.000000
4.000000
9.000000
16.000000
25.000000
36.000000
49.000000
64.000000
81.000000
100.000000
121.000000
144.000000
169.000000
196.000000
225.000000
256.000000
289.000000
324.000000
361.000000
400.000000
441.000000
484.000000
529.000000
576.000000
625.000000
676.000000
729.000000
784.000000
841.000000
900.000000
961.000000
1024.000000
1089.000000
1156.000000
1225.000000
1296.000000
1369.000000
1444.000000
1521.000000
1600.000000
1681.000000
1764.000000
1849.000000
1936.000000
2025.000000
2116.000000
2209.000000
2304.000000
2401.000000
2500.000000
2601.000000
2704.000000
2809.000000
2916.000000
3025.000000
3136.000000
3249.000000
3364.000000
3481.000000
3600.000000
3721.000000
3844.000000
3969.000000
4096.000000
4225.000000
4356.000000
4489.000000
4624.000000
4761.000000
4900.000000
5041.000000
5184.000000
5329.000000
5476.000000
5625.000000
5776.000000
5929.000000
6084.000000
6241.000000
6400.000000
6561.000000
6724.000000
6889.000000
7056.000000
7225.000000
7396.000000
7569.000000
7744.000000
7921.000000
8100.000000
8281.000000
8464.000000
8649.000000
8836.000000
9025.000000
9216.000000
9409.000000
9604.000000
9801.000000
10000.000000
10201.000000
10404.000000
10609.000000
10816.000000
11025.000000
11236.000000
11449.000000
11664.000000
11881.000000
12100.000000
12321.000000
12544.000000
12769.000000
12996.000000
13225.000000
13456.000000
13689.000000
13924.000000
14161.000000
14400.000000
14641.000000
14884.000000
15129.000000
15376.000000
15625.000000
15876.000000
16129.000000
xcount = 127
[/code]


Using the suggestion of offset+gcount gives me this output:
[code]
#blocks = 4, threads = 32
1.000000
0.000000
0.000000
1.000000
1.000000
0.000000
1.000000
1.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
0.000000
1.000000
1.000000
1.000000
0.000000
0.000000
0.000000
1.000000
0.000000
0.000000
0.000000
1.000000
1.000000
0.000000
1.000000
1.000000
1.000000
0.000000
0.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
0.000000
1.000000
0.000000
0.000000
1.000000
1.000000
1.000000
0.000000
0.000000
0.000000
0.000000
1.000000
0.000000
0.000000
0.000000
0.000000
1.000000
0.000000
1.000000
1.000000
1.000000
0.000000
1.000000
1.000000
1.000000
0.000000
0.000000
0.000000
1.000000
1.000000
1.000000
1.000000
0.000000
0.000000
1.000000
1.000000
1.000000
1.000000
0.000000
1.000000
1.000000
0.000000
1.000000
1.000000
1.000000
1.000000
1.000000
0.000000
1.000000
1.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
0.000000
0.000000
0.000000
1.000000
1.000000
0.000000
0.000000
1.000000
1.000000
1.000000
0.000000
0.000000
1.000000
1.000000
1.000000
0.000000
1.000000
0.000000
1.000000
1.000000
1.000000
0.000000
1.000000
0.000000
GPU: xcount = 0
[/code]

#3
Posted 04/23/2012 01:28 PM   
Ok. Another option is to define a global variable

__device__ int totgcount=0; // this line goes before the kernel

and in the kernel you will have

[code]
atomicAdd(&totgcount,1);
out1[totgcount]=t2[i] * in1[tid];
[/code]

out1[0] will be left unchanged, but now each thread that has a value to save will add 1 to totgcount, and the value will be saved in a separate element of the array. At the end you read back totgcount with cudaMemcpyFromSymbol and print the array out1[1:totgcount].
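
For illustration, a self-contained sketch of that pattern (not the poster's kernel; the kernel name and test data here are made up). The value returned by atomicAdd is used as the write index, so each saved value gets a unique slot, and the counter is read back on the host with cudaMemcpyFromSymbol:

[code]
#include <cstdio>
#include <cuda_runtime.h>

__device__ int totgcount = 0;                 // global counter in device memory

// Toy kernel: copy every positive input element into a compacted output array.
__global__ void compact_positive(const float *in, float *out, int n)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n && in[tid] > 0.0f) {
        int idx = atomicAdd(&totgcount, 1);   // reserve a unique slot; idx is the old value
        out[idx] = in[tid];                   // note: output order is not deterministic
    }
}

int main()
{
    const int n = 128;
    float h_in[n], h_out[n];
    for (int i = 0; i < n; ++i) h_in[i] = (i % 2) ? (float)i : -(float)i;

    float *d_in, *d_out;
    cudaMalloc(&d_in, n * sizeof(float));
    cudaMalloc(&d_out, n * sizeof(float));
    cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);

    compact_positive<<<(n + 31) / 32, 32>>>(d_in, d_out, n);

    int h_count = 0;
    cudaMemcpyFromSymbol(&h_count, totgcount, sizeof(int));   // read back the device counter
    cudaMemcpy(h_out, d_out, h_count * sizeof(float), cudaMemcpyDeviceToHost);

    printf("kept %d positive values\n", h_count);
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
[/code]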

#4
Posted 04/23/2012 02:20 PM   
Thanks @pasoleatis for the replies.

I did the following:
[code]__device__ int totgcount=0; // this line before main()
const int threads = 32;
__global__ void test_g(float *in1, float *in2, float *out1, int *ct, int posdir, int pos)
{
int tid = threadIdx.x + blockIdx.x*blockDim.x;
__shared__ float t1[threads];
__shared__ float t2[threads];

int gcount = 0;
//int offset = tid*32;

for(int i = 0; i < posdir*pos; i += 32) {
if (threadIdx.x < 32) {
t1[threadIdx.x] = in2[i%posdir];
}
__syncthreads();

for(int i = 0; i < 32; i++)
{
t2[i] = t1[i] * in1[tid];
if(t2[i] > 0){
atomicAdd(&totgcount,1);
out1[totgcount]=t2[i] * in1[tid];
//out1[offset+gcount] = t2[i] * in1[tid];
gcount = gcount + 1;
}
}
}

if(tid==0)
ct[0] = gcount;

}
int *h_xc = (int*) malloc(sizeof(int) * 1);
cudaMemcpyFromSymbol(h_xc, totgcount, sizeof(int)*1, cudaMemcpyDeviceToHost);
printf("GPU: xcount = %d\n", h_xc[0]); // Output looks like this: GPU: xcount = 1928669800

[/code]
#5
Posted 04/23/2012 03:01 PM   
Doing this will be terribly slow, because the atomic updates to totgcount will serialize the saving operation. The fastest way I see at the moment is to save with an offset of 32*tid and, at the end, only use the non-zero elements. Good luck.

I think there might be a problem with these 2 lines, though I hope it works. With this code the data will be shuffled in the out1 array.
[code]
atomicAdd(&totgcount,1);
out1[totgcount]=t2[i] * in1[tid];
[/code]
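
A sketch of the host-side filtering that approach implies, assuming out1 was zero-filled (e.g. with cudaMemset) before the kernel ran and each thread writes at most 32 values starting at offset 32*tid; h_out1 and the sizes are placeholder names for the copied-back buffer:

[code]
// h_out1 holds ntr*32 floats copied back from the device; slots that were
// never written are still 0.0f, so compact the non-zero entries in place.
int xcount = 0;
for (int k = 0; k < ntr * 32; ++k) {
    if (h_out1[k] != 0.0f) {
        h_out1[xcount] = h_out1[k];
        xcount += 1;
    }
}
printf("xcount = %d\n", xcount);
[/code]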
#6
Posted 04/23/2012 03:37 PM   