CUDA shared memory usage + no reduction with threads
Looking at Mark Harris's reduction example, I am trying to see if I can have threads store intermediate values without a reduction operation:

For example, the CPU code:
[code]
for(int i = 0; i < ntr; i++)
{
    for(int j = 0; j < pos*posdir; j++)
    {
        val = x[i] * arr[j];
        if(val > 0.0)
        {
            out[xcount] = val*x[i];
            xcount += 1;
        }
    }
}
[/code]

Equivalent GPU code:

[code]

const int threads = 64;
num_blocks = ntr/threads;

__global__ void test_g(float *in1, float *in2, float *out1, int *ct, int posdir, int pos)
{
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    __shared__ float t1[threads];
    __shared__ float t2[threads];

    int gcount = 0;

    for(int i = 0; i < posdir*pos; i += 32) {
        if (threadIdx.x < 32) {
            t1[threadIdx.x] = in2[i%posdir];
        }
        __syncthreads();

        for(int i = 0; i < 32; i++)
        {
            t2[i] = t1[i] * in1[tid];
            if(t2[i] > 0){
                out1[gcount] = t2[i] * in1[tid];
                gcount = gcount + 1;
            }
        }
    }
    ct[0] = gcount;
}
[/code]

As I debug, I find that for block (0,0,0) and thread (0,0,0) I can sequentially see the values of t2 being updated. After the debugger switches focus to block (0,0,0) and thread (32,0,0), the values of out1[0] are overwritten. How can I store the values of out1 for each thread and write them to the output?

Any suggestions? Thanks in advance !

#1
Posted 04/22/2012 08:52 PM   
Define offset=tid*32; and replace out1[gcount] with out1[offset+gcount]
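
For reference, a minimal sketch of only the lines that change (the rest of the kernel stays as posted above, and out1 is assumed to have room for 32 floats per thread):

[code]
int offset = tid*32;                          // each thread owns its own 32-element slot in out1

if(t2[i] > 0){
    out1[offset + gcount] = t2[i] * in1[tid]; // write into this thread's slot
    gcount = gcount + 1;
}
[/code]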
#2
Posted 04/22/2012 09:33 PM   
What I am trying to do here is the following steps:
(1) Store 32 values of in2 in the shared memory variable t1.
(2) For each value of i and in1[tid], calculate t2[i].
(3) If t2[i] > 0 for that particular combination of i, write t2[i]*in1[tid] to out1[offset+gcount].

But my output is all wrong. I am not even able to get a count of all the times t2[i] is greater than 0.

Any suggestions on how to save the value of gcount for each i and tid?


[code]
const int threads = 32;
num_blocks = ntr/threads;

__global__ void test_g(float *in1, float *in2, float *out1, int *ct, int posdir, int pos)
{
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    __shared__ float t1[threads];
    __shared__ float t2[threads];

    int gcount = 0;
    int offset = tid*32;

    for(int i = 0; i < posdir*pos; i += 32) {
        if (threadIdx.x < 32) {
            t1[threadIdx.x] = in2[i%posdir];
        }
        __syncthreads();

        for(int i = 0; i < 32; i++)
        {
            t2[i] = t1[i] * in1[tid];
            if(t2[i] > 0){
                out1[offset+gcount] = t2[i] * in1[tid];
                gcount = gcount + 1;
            }
        }
    }
    ct[0] = gcount;
}
[/code]

for example: in the CPU sequential code:
[code]
for(int i = 0; i < ntr; ++i)
{
    x[i] = i;
}

for(int i = 0; i < pos*posdir; ++i)
{
    arr[i] = -i;
}

arr[1] = 1;

for(int i = 0; i < ntr; i++)
{
    for(int j = 0; j < pos*posdir; j++)
    {
        val = x[i] * arr[j];
        if(val > 0.0)
        {
            out[xcount] = val*x[i];
            xcount += 1;
        }
    }
}

[/code]

My output for (ntr = 128; pos = 1; posdir = 32):
[code]
1.000000
4.000000
9.000000
16.000000
25.000000
36.000000
49.000000
64.000000
81.000000
100.000000
121.000000
144.000000
169.000000
196.000000
225.000000
256.000000
289.000000
324.000000
361.000000
400.000000
441.000000
484.000000
529.000000
576.000000
625.000000
676.000000
729.000000
784.000000
841.000000
900.000000
961.000000
1024.000000
1089.000000
1156.000000
1225.000000
1296.000000
1369.000000
1444.000000
1521.000000
1600.000000
1681.000000
1764.000000
1849.000000
1936.000000
2025.000000
2116.000000
2209.000000
2304.000000
2401.000000
2500.000000
2601.000000
2704.000000
2809.000000
2916.000000
3025.000000
3136.000000
3249.000000
3364.000000
3481.000000
3600.000000
3721.000000
3844.000000
3969.000000
4096.000000
4225.000000
4356.000000
4489.000000
4624.000000
4761.000000
4900.000000
5041.000000
5184.000000
5329.000000
5476.000000
5625.000000
5776.000000
5929.000000
6084.000000
6241.000000
6400.000000
6561.000000
6724.000000
6889.000000
7056.000000
7225.000000
7396.000000
7569.000000
7744.000000
7921.000000
8100.000000
8281.000000
8464.000000
8649.000000
8836.000000
9025.000000
9216.000000
9409.000000
9604.000000
9801.000000
10000.000000
10201.000000
10404.000000
10609.000000
10816.000000
11025.000000
11236.000000
11449.000000
11664.000000
11881.000000
12100.000000
12321.000000
12544.000000
12769.000000
12996.000000
13225.000000
13456.000000
13689.000000
13924.000000
14161.000000
14400.000000
14641.000000
14884.000000
15129.000000
15376.000000
15625.000000
15876.000000
16129.000000
xcount = 127
[/code]


Using the suggestion of offset+gcount gives me this output:
[code]
#blocks = 4, threads = 32
1.000000
0.000000
0.000000
1.000000
1.000000
0.000000
1.000000
1.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
0.000000
1.000000
1.000000
1.000000
0.000000
0.000000
0.000000
1.000000
0.000000
0.000000
0.000000
1.000000
1.000000
0.000000
1.000000
1.000000
1.000000
0.000000
0.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
0.000000
1.000000
0.000000
0.000000
1.000000
1.000000
1.000000
0.000000
0.000000
0.000000
0.000000
1.000000
0.000000
0.000000
0.000000
0.000000
1.000000
0.000000
1.000000
1.000000
1.000000
0.000000
1.000000
1.000000
1.000000
0.000000
0.000000
0.000000
1.000000
1.000000
1.000000
1.000000
0.000000
0.000000
1.000000
1.000000
1.000000
1.000000
0.000000
1.000000
1.000000
0.000000
1.000000
1.000000
1.000000
1.000000
1.000000
0.000000
1.000000
1.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
0.000000
0.000000
0.000000
1.000000
1.000000
0.000000
0.000000
1.000000
1.000000
1.000000
0.000000
0.000000
1.000000
1.000000
1.000000
0.000000
1.000000
0.000000
1.000000
1.000000
1.000000
0.000000
1.000000
0.000000
GPU: xcount = 0
[/code]

#3
Posted 04/23/2012 01:28 PM   
Ok. Another option is to define a global variable

__device__ int totgcount=0; // this line goes before the kernel

and in the kernel you will have

[code]
atomicAdd(&totgcount,1);
out1[totgcount]=t2[i] * in1[tid];
[/code]

out1[0] will be left unchanged, but now each thread that has a value to save will add 1 to totgcount, and the value will be saved in a separate element of the array. At the end you read back totgcount with cudaMemcpyFromSymbol and print the array out1[1:totgcount].
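
For illustration, a self-contained sketch of that pattern (not the poster's kernel; the kernel name and test data here are made up). The value returned by atomicAdd is used as the write index, so each saved value gets a unique slot, and the counter is read back on the host with cudaMemcpyFromSymbol:

[code]
#include <cstdio>
#include <cuda_runtime.h>

__device__ int totgcount = 0;                 // global counter in device memory

// Toy kernel: copy every positive input element into a compacted output array.
__global__ void compact_positive(const float *in, float *out, int n)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n && in[tid] > 0.0f) {
        int idx = atomicAdd(&totgcount, 1);   // reserve a unique slot; idx is the old value
        out[idx] = in[tid];                   // note: output order is not deterministic
    }
}

int main()
{
    const int n = 128;
    float h_in[n], h_out[n];
    for (int i = 0; i < n; ++i) h_in[i] = (i % 2) ? (float)i : -(float)i;

    float *d_in, *d_out;
    cudaMalloc(&d_in, n * sizeof(float));
    cudaMalloc(&d_out, n * sizeof(float));
    cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);

    compact_positive<<<(n + 31) / 32, 32>>>(d_in, d_out, n);

    int h_count = 0;
    cudaMemcpyFromSymbol(&h_count, totgcount, sizeof(int));   // read back the device counter
    cudaMemcpy(h_out, d_out, h_count * sizeof(float), cudaMemcpyDeviceToHost);

    printf("kept %d positive values\n", h_count);
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
[/code]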

#4
Posted 04/23/2012 02:20 PM   
Thanks @pasoleatis for the replies.

I did the following:
[code]__device__ int totgcount=0; // this line before main()
const int threads = 32;
__global__ void test_g(float *in1, float *in2, float *out1, int *ct, int posdir, int pos)
{
int tid = threadIdx.x + blockIdx.x*blockDim.x;
__shared__ float t1[threads];
__shared__ float t2[threads];

int gcount = 0;
//int offset = tid*32;

for(int i = 0; i < posdir*pos; i += 32) {
if (threadIdx.x < 32) {
t1[threadIdx.x] = in2[i%posdir];
}
__syncthreads();

for(int i = 0; i < 32; i++)
{
t2[i] = t1[i] * in1[tid];
if(t2[i] > 0){
atomicAdd(&totgcount,1);
out1[totgcount]=t2[i] * in1[tid];
//out1[offset+gcount] = t2[i] * in1[tid];
gcount = gcount + 1;
}
}
}

if(tid==0)
ct[0] = gcount;

}
int *h_xc = (int*) malloc(sizeof(int) * 1);
cudaMemcpyFromSymbol(h_xc, totgcount, sizeof(int)*1, cudaMemcpyDeviceToHost);
printf("GPU: xcount = %d\n", h_xc[0]); // Output looks like this: GPU: xcount = 1928669800

[/code]
#5
Posted 04/23/2012 03:01 PM   
Doing this will be terribly slow, because the atomic updates to totgcount will serialize the saving operation. The fastest way I see at the moment is to save with an offset of 32*tid and, at the end, only use the non-zero elements. Good luck.

I think there might be a problem with these 2 lines, though I hope it works. With this code the data will be shuffled in the out1 array.
[code]
atomicAdd(&totgcount,1);
out1[totgcount]=t2[i] * in1[tid];
[/code]
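
A sketch of the host-side filtering that approach implies, assuming out1 was zero-filled (e.g. with cudaMemset) before the kernel ran and each thread writes at most 32 values starting at offset 32*tid; h_out1 and the sizes are placeholder names for the copied-back buffer:

[code]
// h_out1 holds ntr*32 floats copied back from the device; slots that were
// never written are still 0.0f, so compact the non-zero entries in place.
int xcount = 0;
for (int k = 0; k < ntr * 32; ++k) {
    if (h_out1[k] != 0.0f) {
        h_out1[xcount] = h_out1[k];
        xcount += 1;
    }
}
printf("xcount = %d\n", xcount);
[/code]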
#6
Posted 04/23/2012 03:37 PM   