Problem with 2-dimensional thread blocks
Hi all!
I have some troubles with 2-dimensional thread blocks. In fact it seems that I can't use the threadIdx.y coordinate. The following example shows my issue.

[b]bi_thread_block.cu:[/b]

#define SIZE 10
#include <stdio.h>

// Kernel definition
__global__ void add(int* device)
{
int i =5*threadIdx.y + threadIdx.x;
device[i] = i;
}

int main()
{
int A[SIZE]={0};
int *devPtrA;

int memsize= SIZE * sizeof(int);

cudaMalloc((void**)&devPtrA, memsize);
cudaMemcpy(devPtrA, A, memsize, cudaMemcpyHostToDevice);

for (int i=0; i<SIZE; i++)
printf("A[%d]=%d\n",i,A[i]);

printf("\n");

add<<<2, 5>>>(devPtrA);

cudaMemcpy(A, devPtrA, memsize, cudaMemcpyDeviceToHost);

for (int i=0; i<SIZE; i++)
printf("A[%d]=%d\n",i,A[i]);

cudaFree(devPtrA);
return 0;
}

[b]Command I use to compile:[/b]
nvcc -o bi_thread_block bi_thread_block.cu


[b]./bi_thread_block output:[/b]
A[0]=0
A[1]=0
A[2]=0
A[3]=0
A[4]=0
A[5]=0
A[6]=0
A[7]=0
A[8]=0
A[9]=0

A[0]=0
A[1]=1
A[2]=2
A[3]=3
A[4]=4
A[5]=0
A[6]=0
A[7]=0
A[8]=0
A[9]=0

The first 5 elements are modified by "add", while the other 5 are not. I've also tried to use only the y coordinate calling add<<<10, 1>>>(devPtrA) and changing "add" to

__global__ void add(int* device)
{
int i =threadIdx.y;
device[i] = i;
}

but it doesn't work either. Does anyone have any idea?
Thanks a lot!
Giacomo
Hi all!

I have some troubles with 2-dimensional thread blocks. In fact it seems that I can't use the threadIdx.y coordinate. The following example shows my issue.



bi_thread_block.cu:



#define SIZE 10

#include <stdio.h>



// Kernel definition

__global__ void add(int* device)

{

int i =5*threadIdx.y + threadIdx.x;

device[i] = i;

}



int main()

{

int A[SIZE]={0};

int *devPtrA;



int memsize= SIZE * sizeof(int);



cudaMalloc((void**)&devPtrA, memsize);

cudaMemcpy(devPtrA, A, memsize, cudaMemcpyHostToDevice);



for (int i=0; i<SIZE; i++)

printf("A[%d]=%d\n",i,A[i]);



printf("\n");



add<<<2, 5>>>(devPtrA);



cudaMemcpy(A, devPtrA, memsize, cudaMemcpyDeviceToHost);



for (int i=0; i<SIZE; i++)

printf("A[%d]=%d\n",i,A[i]);



cudaFree(devPtrA);

return 0;

}



Command I use to compile:

nvcc -o bi_thread_block bi_thread_block.cu





./bi_thread_block output:

A[0]=0

A[1]=0

A[2]=0

A[3]=0

A[4]=0

A[5]=0

A[6]=0

A[7]=0

A[8]=0

A[9]=0



A[0]=0

A[1]=1

A[2]=2

A[3]=3

A[4]=4

A[5]=0

A[6]=0

A[7]=0

A[8]=0

A[9]=0



The first 5 elements are modified by "add", while the other 5 are not. I've also tried to use only the y coordinate calling add<<<10, 1>>>(devPtrA) and changing "add" to



__global__ void add(int* device)

{

int i =threadIdx.y;

device[i] = i;

}



but it doesn't work either. Does anyone have any idea?

Thanks a lot!

Giacomo

#1
Posted 03/20/2012 03:32 PM   
This line is wrong:
int i =5*threadIdx.y + threadIdx.x;

This only gives you the index of the thread in a block

it should be:
[code]
int index_x = blockIdx.x * blockDim.x + threadIdx.x;
int index_y = blockIdx.y * blockDim.y + threadIdx.y;

// map the two 2D indices to a single linear, 1D index
int grid_width = gridDim.x * blockDim.x;
int i = index_y * grid_width + index_x;
[/code]

ripped from here http://code.google.com/p/stanford-cs193g-sp2010/wiki/TutorialMultidimensionalKernelLaunch
This line is wrong:

int i =5*threadIdx.y + threadIdx.x;



This only gives you the index of the thread in a block



it should be:



int index_x = blockIdx.x * blockDim.x + threadIdx.x;

int index_y = blockIdx.y * blockDim.y + threadIdx.y;



// map the two 2D indices to a single linear, 1D index

int grid_width = gridDim.x * blockDim.x;

int i = index_y * grid_width + index_x;




ripped from here http://code.google.com/p/stanford-cs193g-sp2010/wiki/TutorialMultidimensionalKernelLaunch

#2
Posted 03/20/2012 03:58 PM   
Thank you pasoleatis! You're right!
Thank you pasoleatis! You're right!

#3
Posted 03/20/2012 04:05 PM   
Actually the problem is not really solved because I still can't use the y coordinate. In the following example I investigate the behaviour of index_x, index_y and index_y * grid_width + index_x:

[b]bi_thread_block.cu:[/b]

// Kernel definition
__global__ void add(int* device, int* ydevice, int* xdevice)
{
int index_x = blockIdx.x * blockDim.x + threadIdx.x;
int index_y = blockIdx.y * blockDim.y + threadIdx.y;

// map the two 2D indices to a single linear, 1D index
int grid_width = gridDim.x * blockDim.x;
int index = index_y * grid_width + index_x;
device[index] = index;
ydevice[index_y] = index_y;
xdevice[index_x] = index_x;
}

int main()
{
int A[SIZE]={0}, B[SIZE]={0}, C[SIZE]={0};


int *devPtrA, *devPtrB, *devPtrC;

int memsize= SIZE * sizeof(int);

cudaMalloc((void**)&devPtrA, memsize);
cudaMalloc((void**)&devPtrB, memsize);
cudaMalloc((void**)&devPtrC, memsize);

cudaMemcpy(devPtrA, A, memsize, cudaMemcpyHostToDevice);
cudaMemcpy(devPtrB, B, memsize, cudaMemcpyHostToDevice);
cudaMemcpy(devPtrC, C, memsize, cudaMemcpyHostToDevice);


add<<<5, 2>>>(devPtrA, devPtrB, devPtrC);
cudaMemcpy(A, devPtrA, memsize, cudaMemcpyDeviceToHost);
cudaMemcpy(B, devPtrB, memsize, cudaMemcpyDeviceToHost);
cudaMemcpy(C, devPtrC, memsize, cudaMemcpyDeviceToHost);

for (int i=0; i<SIZE; i++)
printf("A[%d]=%d\n",i,A[i]);
for (int i=0; i<SIZE; i++)
printf("B[%d]=%d\n",i,B[i]);
for (int i=0; i<SIZE; i++)
printf("C[%d]=%d\n",i,C[i]);

cudaFree(devPtrA);
cudaFree(devPtrB);
cudaFree(devPtrC);
return 0;
}

[b]./bi_thread_block output:[/b]

A[0]=0
A[1]=1
A[2]=2
A[3]=3
A[4]=4
A[5]=5
A[6]=6
A[7]=7
A[8]=8
A[9]=9
B[0]=0
B[1]=0
B[2]=0
B[3]=0
B[4]=0
B[5]=0
B[6]=0
B[7]=0
B[8]=0
B[9]=0
C[0]=0
C[1]=1
C[2]=2
C[3]=3
C[4]=4
C[5]=5
C[6]=6
C[7]=7
C[8]=8
C[9]=9

ThreadIdx.y and index_y seem to be always 0.
Actually the problem is not really solved because I still can't use the y coordinate. In the following example I investigate the behaviour of index_x, index_y and index_y * grid_width + index_x:



bi_thread_block.cu:



// Kernel definition

__global__ void add(int* device, int* ydevice, int* xdevice)

{

int index_x = blockIdx.x * blockDim.x + threadIdx.x;

int index_y = blockIdx.y * blockDim.y + threadIdx.y;



// map the two 2D indices to a single linear, 1D index

int grid_width = gridDim.x * blockDim.x;

int index = index_y * grid_width + index_x;

device[index] = index;

ydevice[index_y] = index_y;

xdevice[index_x] = index_x;

}



int main()

{

int A[SIZE]={0}, B[SIZE]={0}, C[SIZE]={0};





int *devPtrA, *devPtrB, *devPtrC;



int memsize= SIZE * sizeof(int);



cudaMalloc((void**)&devPtrA, memsize);

cudaMalloc((void**)&devPtrB, memsize);

cudaMalloc((void**)&devPtrC, memsize);



cudaMemcpy(devPtrA, A, memsize, cudaMemcpyHostToDevice);

cudaMemcpy(devPtrB, B, memsize, cudaMemcpyHostToDevice);

cudaMemcpy(devPtrC, C, memsize, cudaMemcpyHostToDevice);





add<<<5, 2>>>(devPtrA, devPtrB, devPtrC);

cudaMemcpy(A, devPtrA, memsize, cudaMemcpyDeviceToHost);

cudaMemcpy(B, devPtrB, memsize, cudaMemcpyDeviceToHost);

cudaMemcpy(C, devPtrC, memsize, cudaMemcpyDeviceToHost);



for (int i=0; i<SIZE; i++)

printf("A[%d]=%d\n",i,A[i]);

for (int i=0; i<SIZE; i++)

printf("B[%d]=%d\n",i,B[i]);

for (int i=0; i<SIZE; i++)

printf("C[%d]=%d\n",i,C[i]);



cudaFree(devPtrA);

cudaFree(devPtrB);

cudaFree(devPtrC);

return 0;

}



./bi_thread_block output:



A[0]=0

A[1]=1

A[2]=2

A[3]=3

A[4]=4

A[5]=5

A[6]=6

A[7]=7

A[8]=8

A[9]=9

B[0]=0

B[1]=0

B[2]=0

B[3]=0

B[4]=0

B[5]=0

B[6]=0

B[7]=0

B[8]=0

B[9]=0

C[0]=0

C[1]=1

C[2]=2

C[3]=3

C[4]=4

C[5]=5

C[6]=6

C[7]=7

C[8]=8

C[9]=9



ThreadIdx.y and index_y seem to be always 0.

#4
Posted 03/20/2012 04:35 PM   
I think in this case the problem is the way you submit the kernel. The code in the kernel is correct, but you have to submit the kernel in a different way.

you use add<<<5, 2>>>(devPtrA, devPtrB, devPtrC); This means that you use 1D grids and 1D blocks; instead you can define:

dim3 blocks=dim3(bx,by,bz),threads=dim3(tx,ty,tz);

now use this add<<<blocks,threads>>>(devPtrA, devPtrB, devPtrC);

I think it is very good you try to understand this. This is very important and can save lots of trouble in the future if it is done properly from the beginning.
I think in this case the problem is the way you submit the kernel. The code in the kernel is correct, but you have to submit the kernel in a different way.



you use add<<<5, 2>>>(devPtrA, devPtrB, devPtrC); This means that you use 1D grids and 1D blocks; instead you can define:



dim3 blocks=dim3(bx,by,bz),threads=dim3(tx,ty,tz);



now use this add<<<blocks,threads>>>(devPtrA, devPtrB, devPtrC);



I think it is very good you try to understand this. This is very important and can save lots of trouble in the future if it is done properly from the beginning.

#5
Posted 03/20/2012 05:44 PM   
Ok, I see.. I'll look through the way of submission more carefully.
Thanks again :)
Ok, I see.. I'll look through the way of submission more carefully.

Thanks again :)

#6
Posted 03/20/2012 07:44 PM   
[quote name='giajj' date='20 March 2012 - 07:44 PM' timestamp='1332272699' post='1385481']
Ok, I see.. I'll look through the way of submission more carefully.
Thanks again :)
[/quote]
Just to complete: using <<<5,2>>> is equivalent to <<<dim3(5,1,1),dim3(2,1,1)>>>
[quote name='giajj' date='20 March 2012 - 07:44 PM' timestamp='1332272699' post='1385481']

Ok, I see.. I'll look through the way of submission more carefully.

Thanks again :)



Just to complete: using <<<5,2>>> is equivalent to <<<dim3(5,1,1),dim3(2,1,1)>>>

#7
Posted 03/20/2012 09:00 PM   
Scroll To Top