what I am trying to do here is the following steps:
(1)Store 32 values of in2 in shared memory variable t1,
(2)For each value of i and in1[tid], calculate t2[i],
(3)if t2[i] > 0 for that particular combination of i, write t2[i]*in1[tid] to out1[offset+gcount]
But my output is all wrong. I am not even able to get a count of all the times t2[i] is greater than 0.
Any suggestions on how to save the value of gcount for each i and tid ??
const int threads = 32;
num_blocks = ntr/threads;
__global__ void test_g(float *in1, float *in2, float *out1, int *ct, int posdir, int pos)
{
int tid = threadIdx.x + blockIdx.x*blockDim.x;
__shared__ float t1[threads];
__shared__ float t2[threads];
int gcount = 0;
int offset = tid*32;
for(int i = 0; i < posdir*pos; i += 32) {
if (threadIdx.x < 32) {
t1[threadIdx.x] = in2[i%posdir];
}
__syncthreads();
for(int i = 0; i < 32; i++)
{
t2[i] = t1[i] * in1[tid];
if(t2[i] > 0){
out1[offset+gcount] = t2[i] * in1[tid];
gcount = gcount + 1;
}
}
}
ct[0] = gcount;
}
for example: in the CPU sequential code:
for(int i = 0; i < ntr; ++i)
{
x[i] = i;
}
for(int i = 0; i < pos*posdir; ++i)
{
arr[i] = -i;
}
arr[1] = 1;
for(int i = 0; i < ntr; i++)
{
for(int j = 0; j < pos* posdir; j++)
{
val = x[i] * arr[j];
if(val > 0.0)
{
out[xcount] = val*x[i];
xcount += 1;
}
}
}
My output for (ntr = 128;pos = 1;posdir = 32;)
1.000000
4.000000
9.000000
16.000000
25.000000
36.000000
49.000000
64.000000
81.000000
100.000000
121.000000
144.000000
169.000000
196.000000
225.000000
256.000000
289.000000
324.000000
361.000000
400.000000
441.000000
484.000000
529.000000
576.000000
625.000000
676.000000
729.000000
784.000000
841.000000
900.000000
961.000000
1024.000000
1089.000000
1156.000000
1225.000000
1296.000000
1369.000000
1444.000000
1521.000000
1600.000000
1681.000000
1764.000000
1849.000000
1936.000000
2025.000000
2116.000000
2209.000000
2304.000000
2401.000000
2500.000000
2601.000000
2704.000000
2809.000000
2916.000000
3025.000000
3136.000000
3249.000000
3364.000000
3481.000000
3600.000000
3721.000000
3844.000000
3969.000000
4096.000000
4225.000000
4356.000000
4489.000000
4624.000000
4761.000000
4900.000000
5041.000000
5184.000000
5329.000000
5476.000000
5625.000000
5776.000000
5929.000000
6084.000000
6241.000000
6400.000000
6561.000000
6724.000000
6889.000000
7056.000000
7225.000000
7396.000000
7569.000000
7744.000000
7921.000000
8100.000000
8281.000000
8464.000000
8649.000000
8836.000000
9025.000000
9216.000000
9409.000000
9604.000000
9801.000000
10000.000000
10201.000000
10404.000000
10609.000000
10816.000000
11025.000000
11236.000000
11449.000000
11664.000000
11881.000000
12100.000000
12321.000000
12544.000000
12769.000000
12996.000000
13225.000000
13456.000000
13689.000000
13924.000000
14161.000000
14400.000000
14641.000000
14884.000000
15129.000000
15376.000000
15625.000000
15876.000000
16129.000000
xcount = 127
Using the suggestion of offset+gcount gives me the answer as:
#blocks = 4, threads = 32
1.000000
0.000000
0.000000
1.000000
1.000000
0.000000
1.000000
1.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
0.000000
1.000000
1.000000
1.000000
0.000000
0.000000
0.000000
1.000000
0.000000
0.000000
0.000000
1.000000
1.000000
0.000000
1.000000
1.000000
1.000000
0.000000
0.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
0.000000
1.000000
0.000000
0.000000
1.000000
1.000000
1.000000
0.000000
0.000000
0.000000
0.000000
1.000000
0.000000
0.000000
0.000000
0.000000
1.000000
0.000000
1.000000
1.000000
1.000000
0.000000
1.000000
1.000000
1.000000
0.000000
0.000000
0.000000
1.000000
1.000000
1.000000
1.000000
0.000000
0.000000
1.000000
1.000000
1.000000
1.000000
0.000000
1.000000
1.000000
0.000000
1.000000
1.000000
1.000000
1.000000
1.000000
0.000000
1.000000
1.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
0.000000
0.000000
0.000000
1.000000
1.000000
0.000000
0.000000
1.000000
1.000000
1.000000
0.000000
0.000000
1.000000
1.000000
1.000000
0.000000
1.000000
0.000000
1.000000
1.000000
1.000000
0.000000
1.000000
0.000000
GPU: xcount = 0