barrier
Hi, I am new to OpenCL.
There is a problem that confuses me.
Without the barrier, C[0] and C[1] have different values.
With the barrier function, C[0] and C[1] should get the same value.

The code gives the wrong answer.
Where is the bug in the code?

opencl code
[code]
// Barrier experiment kernel.
// Work-items with local id 0 and 1 each store their value of a1 into C[local id];
// every work-item additionally stores 88 into C[2].
// C: output buffer in global memory; elements 0..2 are written.
__kernel void barrier_example ( __global int *C )
{

// Local id of this work-item within its work-group (dimension 0)
int t = get_local_id(0);
// a1 is a POINTER to an int in the __local address space. The pointer variable
// itself is an ordinary per-work-item variable; no __local storage is declared here.
__local int *a1 ;
// Sets the pointer value to 0; nothing is written to local memory.
a1 = 0 ;

// Barrier test: only the work-item with local id 1 changes its own copy of a1.
if ( t == 1) {
// Empty-bodied loop, apparently meant as a delay.
// NOTE(review): a compiler may remove it entirely - confirm if the delay matters.
for( int j = 0 ; j < 1000 ; j ++ ) ;
// Changes this work-item's pointer value to 100; still no store to local memory.
// NOTE(review): assigning an integer to a pointer may be rejected or warned about
// by some OpenCL compilers - confirm with the build log.
a1 = 100 ;
}

// Work-group barrier with a local-memory fence. Since nothing above wrote to
// local memory, it does not make one work-item's a1 visible to another.
barrier ( CLK_LOCAL_MEM_FENCE ) ;

// Each of these stores the executing work-item's own a1 (0 for t == 0, 100 for t == 1).
if ( t == 0 ) C [ t ] = a1 ;
if ( t == 1 ) C [ t ] = a1 ;
// Executed unconditionally by every work-item.
C [ 2 ] = 88 ;
}
[/code]

main.c
[code]
#include <stdio.h>
#include <stdlib.h>

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

#define MAX_SOURCE_SIZE (0x100000)

int main(void) {
// Create the two input vectors
const int LIST_SIZE = 1024;

// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;

fp = fopen("barrier_example.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );

// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
&device_id, &ret_num_devices);

// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);

// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

// Create memory buffers on the device for each vector
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);

// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);

// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "barrier_example", &ret);

// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&c_mem_obj);

// Execute the OpenCL kernel on the list
size_t global_item_size = 6 ; // Process the entire lists
size_t local_item_size = 3 ; // Process in groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);

// Read the memory buffer C on the device to the local variable C
int *C = (int*)malloc(sizeof(int)*3*2);
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
3*2 * sizeof(int), C, 0, NULL, NULL);

// Display the result to the screen
for(int i = 0; i < 3; i++)
printf("i= %d , %d\n", i, C[i]);

// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);

ret = clReleaseMemObject(c_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);

free(C);

return 0;
}
[/code]
Hi, I am new to OpenCL.

There is a problem that confuses me.

Without the barrier, C[0] and C[1] have different values.

With the barrier function, C[0] and C[1] should get the same value.



The code gives the wrong answer.

Where is the bug in the code?



opencl code



// Barrier experiment kernel.
// Work-items with local id 0 and 1 each store their value of a1 into C[local id];
// every work-item additionally stores 88 into C[2].
// C: output buffer in global memory; elements 0..2 are written.
__kernel void barrier_example ( __global int *C )

{



// Local id of this work-item within its work-group (dimension 0)

int t = get_local_id(0);

// a1 is a POINTER to an int in the __local address space. The pointer variable
// itself is an ordinary per-work-item variable; no __local storage is declared here.
__local int *a1 ;

// Sets the pointer value to 0; nothing is written to local memory.
a1 = 0 ;



// Barrier test: only the work-item with local id 1 changes its own copy of a1.

if ( t == 1) {

// Empty-bodied loop, apparently meant as a delay.
// NOTE(review): a compiler may remove it entirely - confirm if the delay matters.
for( int j = 0 ; j < 1000 ; j ++ ) ;

// Changes this work-item's pointer value to 100; still no store to local memory.
// NOTE(review): assigning an integer to a pointer may be rejected or warned about
// by some OpenCL compilers - confirm with the build log.
a1 = 100 ;

}



// Work-group barrier with a local-memory fence. Since nothing above wrote to
// local memory, it does not make one work-item's a1 visible to another.
barrier ( CLK_LOCAL_MEM_FENCE ) ;



// Each of these stores the executing work-item's own a1 (0 for t == 0, 100 for t == 1).
if ( t == 0 ) C [ t ] = a1 ;

if ( t == 1 ) C [ t ] = a1 ;

// Executed unconditionally by every work-item.
C [ 2 ] = 88 ;

}




main.c



#include <stdio.h>

#include <stdlib.h>



#ifdef __APPLE__

#include <OpenCL/opencl.h>

#else

#include <CL/cl.h>

#endif



#define MAX_SOURCE_SIZE (0x100000)



int main(void) {

// Create the two input vectors

const int LIST_SIZE = 1024;



// Load the kernel source code into the array source_str

FILE *fp;

char *source_str;

size_t source_size;



fp = fopen("barrier_example.cl", "r");

if (!fp) {

fprintf(stderr, "Failed to load kernel.\n");

exit(1);

}

source_str = (char*)malloc(MAX_SOURCE_SIZE);

source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);

fclose( fp );



// Get platform and device information

cl_platform_id platform_id = NULL;

cl_device_id device_id = NULL;

cl_uint ret_num_devices;

cl_uint ret_num_platforms;

cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);

ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,

&device_id, &ret_num_devices);



// Create an OpenCL context

cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);



// Create a command queue

cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);



// Create memory buffers on the device for each vector

cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,

LIST_SIZE * sizeof(int), NULL, &ret);



// Create a program from the kernel source

cl_program program = clCreateProgramWithSource(context, 1,

(const char **)&source_str, (const size_t *)&source_size, &ret);



// Build the program

ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);



// Create the OpenCL kernel

cl_kernel kernel = clCreateKernel(program, "barrier_example", &ret);



// Set the arguments of the kernel

ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&c_mem_obj);



// Execute the OpenCL kernel on the list

size_t global_item_size = 6 ; // Process the entire lists

size_t local_item_size = 3 ; // Process in groups of 64

ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,

&global_item_size, &local_item_size, 0, NULL, NULL);



// Read the memory buffer C on the device to the local variable C

int *C = (int*)malloc(sizeof(int)*3*2);

ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,

3*2 * sizeof(int), C, 0, NULL, NULL);



// Display the result to the screen

for(int i = 0; i < 3; i++)

printf("i= %d , %d\n", i, C[i]);



// Clean up

ret = clFlush(command_queue);

ret = clFinish(command_queue);

ret = clReleaseKernel(kernel);

ret = clReleaseProgram(program);



ret = clReleaseMemObject(c_mem_obj);

ret = clReleaseCommandQueue(command_queue);

ret = clReleaseContext(context);



free(C);



return 0;

}

#1
Posted 03/29/2012 05:01 AM   
You need to understand the relationship between pointers in private memory and data in local memory. The following line of code

[code]
__local int *a1 ;
[/code]

declares a pointer a1 in private memory that points into local memory. You then make this pointer point to address 0 in local memory.

[code]
a1 = 0 ;
[/code]

Nothing has touched the actual local memory yet; you have just changed the value of a variable in private memory. Then some work-item-specific code runs, in which only work-item 1 sets its private instance of a1 to 100.

[code]
if ( t == 1)
a1 = 100 ;
[/code]

This is followed by a barrier...

[code]
barrier ( CLK_LOCAL_MEM_FENCE ) ;
[/code]

...that will do nothing since no one has written anything to local memory yet. In the end you let work item 0 and 1 write their respective contents of the private variable a1 to global memory.

[code]
if ( t == 0 ) C [ t ] = a1 ;
if ( t == 1 ) C [ t ] = a1 ;
[/code]

These will be 0 and 100, respectively.


From your description of the problem I get the impression that what you really want is to have the variable a1 in local memory instead of private memory. Try changing the first line I quoted to

[code]
__local int a1 ;
[/code]



I have a question myself. How do we declare a pointer in the local address space that points into the local address space? The rule so far has been to prepend the type name with __local, so by that rule it should be

[code]
__local __local int* a1 ;
[/code]


Have never tried it though, and I don't have any OpenCL compiler available at the moment. Does anyone know?
You need to understand the relationship between pointers in private memory and data in local memory. The following line of code





__local int *a1 ;




declares a pointer a1 in private memory that points into local memory. You then make this pointer point to address 0 in local memory.





a1 = 0 ;




Nothing has touched the actual local memory yet; you have just changed the value of a variable in private memory. Then some work-item-specific code runs, in which only work-item 1 sets its private instance of a1 to 100.





if ( t == 1)

a1 = 100 ;




This is followed by a barrier...





barrier ( CLK_LOCAL_MEM_FENCE ) ;




...that will do nothing since no one has written anything to local memory yet. In the end you let work item 0 and 1 write their respective contents of the private variable a1 to global memory.





if ( t == 0 ) C [ t ] = a1 ;

if ( t == 1 ) C [ t ] = a1 ;




These will be 0 and 100, respectively.





From your description of the problem I get the impression that what you really want is to have the variable a1 in local memory instead of private memory. Try changing the first line I quoted to





__local int a1 ;








I have a question myself. How do we declare a pointer in the local address space that points into the local address space? The rule so far has been to prepend the type name with __local, so by that rule it should be





__local __local int* a1 ;






Have never tried it though, and I don't have any OpenCL compiler available at the moment. Does anyone know?

#2
Posted 03/29/2012 01:27 PM   
Scroll To Top