Interaction in an array
Hello, sorry for my English.

I have a task where each point is calculated from its 12 neighboring points.

A scheme is attached.

In the middle is the point that needs to be calculated.

The code looks, very roughly, like this (summing the point itself and its 12 diamond-stencil neighbors):

[code]
long x = blockIdx.x * blockDim.x + threadIdx.x;
long y = blockIdx.y * blockDim.y + threadIdx.y;

data[x][y] = data[x][y] + data[x][y+2];
data[x][y] = data[x][y] + data[x-1][y+1] + data[x][y+1] + data[x+1][y+1];
data[x][y] = data[x][y] + data[x-2][y] + data[x-1][y] + data[x+1][y] + data[x+2][y];
data[x][y] = data[x][y] + data[x-1][y-1] + data[x][y-1] + data[x+1][y-1];
data[x][y] = data[x][y] + data[x][y-2];
[/code]

I don't understand how to get the data from the neighboring cells. Or is it impossible?

I know the GPU is not the best fit for this, but I have to solve it on the GPU, even with some loss of performance.
Attachments

scheme.JPG

#1
Posted 05/09/2012 10:54 AM   
global memory, array[300][300]

#2
Posted 05/09/2012 10:59 AM   
You can certainly implement this on a GPU, but I have three recommendations:

(1) You are going to need separate source and destination arrays. The function depends on the 12 neighbors, but as written, the code changes the input (data[][]) out from under the other threads.

(2) Your function will need boundary conditions to define the value of the function at the edges of the space.

(3) In the implementation, you should use shared memory to reduce the cost of fetching the same data[][] elements from multiple threads. However, you should first try an implementation that doesn't use shared memory, as a baseline case.

The kernel code looks OK, apart from suggestions (1) and (2). In your host code, at a minimum you will need: cudaMalloc() to allocate d_data_src, cudaMalloc() to allocate d_data_dest, cudaMemcpy() to copy the data from the host to d_data_src, a kernel call (i.e., foobar<<<...>>>(d_data_src, d_data_dest)), and cudaMemcpy() to copy d_data_dest back to the CPU. Don't forget to check the return values from the API calls.

--Ken
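For reference, a minimal sketch of that host skeleton. The CUDA_CHECK macro, the stencilKernel name (with a placeholder body), and the flattened 300x300 layout are illustrative only, not from your post:

[code]
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Check the return value of every CUDA API call, as recommended above.
#define CUDA_CHECK(call)                                            \
    do {                                                            \
        cudaError_t err = (call);                                   \
        if (err != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                    cudaGetErrorString(err), __FILE__, __LINE__);   \
            exit(EXIT_FAILURE);                                     \
        }                                                           \
    } while (0)

__global__ void stencilKernel(const float *src, float *dest, int w, int h)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < w && y < h)
        dest[y * w + x] = src[y * w + x];   // placeholder: the real stencil goes here
}

int main()
{
    const int w = 300, h = 300;
    const size_t numBytes = w * h * sizeof(float);

    float *h_data = (float *)malloc(numBytes);
    for (int i = 0; i < w * h; i++) h_data[i] = 1.0f;

    // Separate source and destination, so no thread overwrites input
    // that another thread still needs to read.
    float *d_src = NULL, *d_dest = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_src,  numBytes));
    CUDA_CHECK(cudaMalloc((void **)&d_dest, numBytes));
    CUDA_CHECK(cudaMemcpy(d_src, h_data, numBytes, cudaMemcpyHostToDevice));

    dim3 threads(16, 16);
    dim3 blocks((w + threads.x - 1) / threads.x,
                (h + threads.y - 1) / threads.y);
    stencilKernel<<<blocks, threads>>>(d_src, d_dest, w, h);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpy(h_data, d_dest, numBytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(d_src));
    CUDA_CHECK(cudaFree(d_dest));
    free(h_data);
    return 0;
}
[/code]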

#3
Posted 05/09/2012 11:59 AM   
Hello,

The best way to solve this is to use shared memory. Without shared memory, each element is loaded from global memory many times: once for the thread computing that point, and once for each of the 12 neighboring threads whose stencil includes it. The other alternative is to formulate your problem as a sparse matrix and use the cuSPARSE library.
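For example, a tiled shared-memory kernel for the 2-D case could look roughly like this. This is only a sketch: the 16x16 block size, the two-cell halo, the zero boundary condition, and the flattened row-major layout are assumptions, not from this thread.

[code]
#define BS 16        // block is BS x BS threads
#define R  2         // stencil radius

__global__ void stencil12(const float *src, float *dest, int w, int h)
{
    // Tile with a halo of R cells on every side, so the neighbors of the
    // block's border threads are available in shared memory as well.
    __shared__ float tile[BS + 2*R][BS + 2*R];

    int x = blockIdx.x * BS + threadIdx.x;
    int y = blockIdx.y * BS + threadIdx.y;

    // Each thread cooperatively loads part of the (BS+2R) x (BS+2R) tile.
    for (int ty = threadIdx.y; ty < BS + 2*R; ty += BS)
        for (int tx = threadIdx.x; tx < BS + 2*R; tx += BS) {
            int gx = blockIdx.x * BS + tx - R;
            int gy = blockIdx.y * BS + ty - R;
            tile[ty][tx] = (gx >= 0 && gx < w && gy >= 0 && gy < h)
                           ? src[gy * w + gx] : 0.0f;  // zero boundary
        }
    __syncthreads();

    if (x < w && y < h) {
        int tx = threadIdx.x + R, ty = threadIdx.y + R;
        float v = tile[ty][tx]                                      // center
                + tile[ty+2][tx]
                + tile[ty+1][tx-1] + tile[ty+1][tx] + tile[ty+1][tx+1]
                + tile[ty][tx-2]   + tile[ty][tx-1]
                + tile[ty][tx+1]   + tile[ty][tx+2]
                + tile[ty-1][tx-1] + tile[ty-1][tx] + tile[ty-1][tx+1]
                + tile[ty-2][tx];
        dest[y * w + x] = v;   // write to a separate destination array
    }
}
[/code]

Launch it with dim3 threads(BS, BS) and enough blocks to cover the w x h grid; every neighbor access then reads fast shared memory instead of global memory.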

#4
Posted 05/09/2012 01:10 PM   
Thank you, Ken! :)
The approach with an additional global array works! It's wonderful! Code below.

But the shared-memory version doesn't work. Maybe I'm wrong somewhere; please take a look if you can.

[code]
//it doesn't work, code with shared memory

#include <cstdio>
#include <conio.h>
#include <cuda_runtime.h>

__global__ void incKernel (float * data)
{
    int xIndex = blockIdx.x * blockDim.x + threadIdx.x;

    __shared__ float temp[256];

    if (xIndex < 256)
    {
        temp[xIndex] = data[xIndex];
    }

    __syncthreads();

    if ((xIndex > 0) && (xIndex < 256))
    {
        data[xIndex] = temp[xIndex-1] + temp[xIndex] + temp[xIndex+1];
    }
}

float * getSourceHostArray(int sizeArray);
void check_for_error(const char *er_str);

int main( int argc, char * argv [] )
{
    int sizeArray = 256;
    int numBytes = sizeArray * sizeof(float);

    float *hostArray = getSourceHostArray(sizeArray);

    float *deviceArray = NULL;

    cudaMalloc((void**)&deviceArray, numBytes);

    dim3 threads = dim3(32, 1);
    dim3 blocks = dim3(sizeArray / threads.x, 1);

    cudaMemcpy(deviceArray, hostArray, numBytes, cudaMemcpyHostToDevice);

    incKernel<<<blocks, threads>>>(deviceArray);
    check_for_error("");

    cudaMemcpy(hostArray, deviceArray, numBytes, cudaMemcpyDeviceToHost);

    for (int i = 0; i < sizeArray; i++)
    {
        printf("hostArray[%d]= %f\n", i, hostArray[i]);
    }

    cudaFree(deviceArray);

    delete[] hostArray;   // allocated with new[], so delete[]

    printf("Succeed!!!");
    getch();

    return 0;
}
[/code]


[code]
//it works, code with additional global array

#include <cstdio>
#include <conio.h>
#include <cuda_runtime.h>

__global__ void incKernel (const float * src, float * dest)
{
    int xIndex = blockIdx.x * blockDim.x + threadIdx.x;

    if ((xIndex > 0) && (xIndex < 255))   // < 255 keeps src[xIndex+1] in bounds
    {
        dest[xIndex] = src[xIndex-1] + src[xIndex] + src[xIndex+1];
    }
}

float * getSourceHostArray(int sizeArray);
void check_for_error(const char *er_str);

int main( int argc, char * argv [] )
{
    int sizeArray = 256;
    int numBytes = sizeArray * sizeof(float);

    float *hostArray = getSourceHostArray(sizeArray);

    float *srcDeviceArray = NULL;
    float *destDeviceArray = NULL;

    cudaMalloc((void**)&srcDeviceArray, numBytes);
    cudaMalloc((void**)&destDeviceArray, numBytes);

    cudaMemcpy(srcDeviceArray, hostArray, numBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(destDeviceArray, hostArray, numBytes, cudaMemcpyHostToDevice);

    dim3 threads = dim3(32, 1);
    dim3 blocks = dim3(sizeArray / threads.x, 1);

    incKernel<<<blocks, threads>>>(srcDeviceArray, destDeviceArray);
    check_for_error("");

    cudaMemcpy(hostArray, destDeviceArray, numBytes, cudaMemcpyDeviceToHost);

    for (int i = 0; i < sizeArray; i++)
    {
        printf("hostArray[%d]= %f\n", i, hostArray[i]);
    }

    cudaFree(srcDeviceArray);
    cudaFree(destDeviceArray);

    delete[] hostArray;   // allocated with new[], so delete[]

    printf("Succeed!!!");
    getch();

    return 0;
}

float * getSourceHostArray(int sizeArray)
{
    float *array = new float[sizeArray];

    for (int i = 0; i < sizeArray; i++)
    {
        array[i] = 1.0f;
    }
    return array;
}

// Definition not shown in the original post; a minimal version that
// just reports the last CUDA error is assumed here.
void check_for_error(const char *er_str)
{
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("%s: %s\n", er_str, cudaGetErrorString(err));
}
[/code]

#5
Posted 05/09/2012 01:11 PM   
Hi Dmitry,

It looks like the main problem is that the shared-memory implementation doesn't separate source and destination the way the global-memory implementation does. Again, the issue is that not all threads operate in parallel, not even within a single block. Otherwise, the code looks OK.

In the general case, xIndex will be > 256, so once you remove the < 256 tests, you'll have to map xIndex into 0 to 256 (using "% 256").

--Ken
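Note also that __shared__ memory is per block, so each block only sees the elements its own threads load. A sketch of how the kernel could be restructured, with separate src/dest and a per-block tile plus a one-element halo (the halo handling is an addition, not in your code; a 32-thread block and an array length n are assumed):

[code]
#define BLOCK 32   // must match blockDim.x at launch

__global__ void incKernel(const float *src, float *dest, int n)
{
    __shared__ float temp[BLOCK + 2];          // per-block tile + halo

    int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    int local  = threadIdx.x + 1;              // +1 leaves room for the left halo

    if (xIndex < n)
        temp[local] = src[xIndex];

    // The first and last thread of each block also load the halo cells.
    if (threadIdx.x == 0)
        temp[0] = (xIndex > 0) ? src[xIndex - 1] : 0.0f;
    if (threadIdx.x == blockDim.x - 1)
        temp[local + 1] = (xIndex + 1 < n) ? src[xIndex + 1] : 0.0f;

    __syncthreads();

    if ((xIndex > 0) && (xIndex < n - 1))
        dest[xIndex] = temp[local - 1] + temp[local] + temp[local + 1];
}
[/code]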

#6
Posted 05/10/2012 05:05 PM   
Thank you, Ken! :) That's true.

#7
Posted 06/08/2012 12:13 PM   