Thank you, Ken! :)
The way with additional global array works! It’s wonderful! code bellow
But doesn’t work the way with shared memory, maybe I’m wrong somewhere, please look if you could
//it doesn't work, code with shared memory
__global__ void incKernel (float * data)
{
int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ float temp[256];
if (xIndex < 256)
{
temp[xIndex] = data[xIndex];
}
__syncthreads();
if ((xIndex > 0) && (xIndex < 256))
{
data [xIndex] = temp[xIndex-1] + temp[xIndex] + temp[xIndex+1];
}
}
float * getSourceHostArray(int sizeArray);
void check_for_error(const char *er_str);
int main( int argc, char * argv [] )
{
int sizeArray = 256;
int numBytes = sizeArray * sizeof ( float );
float *hostArray = getSourceHostArray(sizeArray);
float * deviceArray = NULL;
cudaMalloc ( (void**)&deviceArray, numBytes );
dim3 threads = dim3(32, 1);
dim3 blocks = dim3(sizeArray / threads.x, 1);
cudaMemcpy ( deviceArray, hostArray, numBytes, cudaMemcpyHostToDevice );
incKernel<<<blocks, threads>>>(deviceArray);
check_for_error("");
cudaMemcpy ( hostArray, deviceArray, numBytes, cudaMemcpyDeviceToHost );
for ( int i = 0; i < 256; i++ )
{
printf ( "hostArray[%d]= %f\n", i, hostArray [i] );
}
cudaFree (deviceArray);
delete hostArray;
printf("Succeed!!!");
getch();
return 0;
}
//it works, code with additional global array
__global__ void incKernel (const float * src, float * dest)
{
int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
if ((xIndex > 0) && (xIndex < 256))
{
dest [xIndex] = src[xIndex-1] + src[xIndex] + src[xIndex+1];
}
}
float * getSourceHostArray(int sizeArray);
void check_for_error(const char *er_str);
int main( int argc, char * argv [] )
{
int sizeArray = 256;
int numBytes = sizeArray * sizeof ( float );
float *hostArray = getSourceHostArray(sizeArray);
float * srcDeviceArray = NULL;
float * destDeviceArray = NULL;
cudaMalloc ( (void**)&srcDeviceArray, numBytes );
cudaMalloc ( (void**)&destDeviceArray, numBytes );
cudaMemcpy ( srcDeviceArray, hostArray, numBytes, cudaMemcpyHostToDevice );
cudaMemcpy ( destDeviceArray, hostArray, numBytes, cudaMemcpyHostToDevice );
dim3 threads = dim3(32, 1);
dim3 blocks = dim3(sizeArray / threads.x, 1);
incKernel<<<blocks, threads>>>(srcDeviceArray, destDeviceArray);
check_for_error("");
cudaMemcpy ( hostArray, destDeviceArray, numBytes, cudaMemcpyDeviceToHost );
for ( int i = 0; i < 256; i++ )
{
printf ( "hostArray[%d]= %f\n", i, hostArray [i] );
}
cudaFree (srcDeviceArray);
cudaFree (destDeviceArray);
delete hostArray;
printf("Succeed!!!");
getch();
return 0;
}
float * getSourceHostArray(int sizeArray)
{
float *array = new float ;
for ( int i = 0; i < sizeArray; i++ )
{
array [i] = 1.0f;
}
return array;
}