Hi all!
I have a situation where I receive data from a sensor which I want to process on the GPU. Since launching the kernel repeatedly has a lot of overhead, I instead want to keep the kernel running and feed it data in packages as I receive it. For this I have implemented a double buffering system, where the host fills a buffer, signals the kernel that the buffer is ready to be consumed and then begins filling the second buffer while the GPU works. The flags between the device and the host are just pinned memory which the device is able to get via DMA. The way I decide which buffer needs to be used is by incrementing a counter each cycle and then inspecting if the counter is odd or even. If the buffer is not ready (device side), a while loop keeps the thread busy while it waits. When the host has decided the job is done, a flag can be set to tell the threads to terminate.
However, the code I have no gives me a weird output. I’ve set up a trivial example of what I want to do below. Essentially all the threads are really doing for “work” is to put the current value of their iteration counter (the one that decides which buffer to use) into the output array. Since my host terminates the threads after filling the buffers 10 times, I would expect the output array to contain only 9’s at termination. However some array cells do and some contain 8’s or even 11’s…it’s very strange. I’m running all this on Ubuntu 16.04 using CUDA 9.0 on a GeForce GTX 1070 with max-q. So here is my code and the output I’m getting:
#include <stdio.h>
#define DATA_BUFFER_SIZE 32
#define EVENT_DIMENSION 4
#define NUM_TRANSFORMS 60
__device__ volatile int blockcounter1=0;
__device__ volatile int blockcounter2=0;
__global__ void doubleBuffer2D(int* buffer1, int* buffer2, volatile int* bufferstate1, volatile int* bufferstate2, int* outputList, volatile int* terminateFlag) {
int transformID = blockIdx.y;
int eventID = blockIdx.x*blockDim.x+threadIdx.x;
int outIdx = transformID*DATA_BUFFER_SIZE+eventID;
if(eventID<DATA_BUFFER_SIZE) {
int iteration=0;
while(*terminateFlag!=1) {
volatile int * state = (iteration%2==0)? bufferstate1:bufferstate2;
int * buffer = (iteration%2==0)? buffer1:buffer2;
volatile int * blockcounter = (iteration%2==0)? &blockcounter1:&blockcounter2;
while(*state==0);//Wait until the host releases the buffer
//Do something ======
outputList[outIdx]=iteration;
//Finish doing something =====
__syncthreads(); // wait for my block to finish
if (!threadIdx.x) atomicAdd((int *)blockcounter, 1); //Mark my block as finished
__threadfence(); //make sure everyone can see my atomicAdd before proceeding
if(transformID==0 && eventID==0) { // I'm the master thread!
while(*blockcounter<NUM_TRANSFORMS); //Wait for everyone to be done.
*blockcounter=0;
*state=0;//Release the buffer back to the host
}
iteration++;
}
}
}
int main() {
printf("Run \n");
int * data = (int*)malloc(DATA_BUFFER_SIZE*sizeof(int));
int * d_outputList;
int * d_buffer1;
int * d_buffer2;
volatile int * outputBuffer1_state; //when the state=0, the host is claiming the buffer, if state=1 the device is claiming the buffer, if state=-1 the buffer is released
volatile int * outputBuffer2_state;
volatile int * terminate_thread;
int h_outputList[NUM_TRANSFORMS * DATA_BUFFER_SIZE * EVENT_DIMENSION];
cudaMalloc(&d_outputList, sizeof(int) * NUM_TRANSFORMS * DATA_BUFFER_SIZE);
cudaMalloc(&d_buffer1, (sizeof(int) * DATA_BUFFER_SIZE * EVENT_DIMENSION));
cudaMalloc(&d_buffer2, (sizeof(int) * DATA_BUFFER_SIZE * EVENT_DIMENSION));
cudaHostAlloc(&outputBuffer1_state, sizeof(int), cudaHostAllocMapped);
cudaHostAlloc(&outputBuffer2_state, sizeof(int), cudaHostAllocMapped);
cudaHostAlloc(&terminate_thread, sizeof(int), cudaHostAllocMapped);
cudaStream_t streamk, streamc;
cudaStreamCreate(&streamk);
cudaStreamCreate(&streamc);
*outputBuffer1_state = 0;
*outputBuffer2_state = 0;
*terminate_thread = -1;
int blockwidth = DATA_BUFFER_SIZE;
int blocks = DATA_BUFFER_SIZE;
int gridWidth = (blocks+blockwidth-1)/blockwidth;
dim3 gridDim(gridWidth, NUM_TRANSFORMS);
printf("Launching kernel with grid=%d,%d, block=%d \n", gridWidth, NUM_TRANSFORMS, blocks);
doubleBuffer2D<<<gridDim, blocks, 0, streamk>>>(d_buffer1, d_buffer2, outputBuffer1_state, outputBuffer2_state, d_outputList, terminate_thread);
for (int i = 0; i < 10; i++) {
printf("========== Iteration %d ==========\n", i);
int * output_buffer = (i % 2 == 0) ? d_buffer1 : d_buffer2;
volatile int * buffer_state = (i % 2 == 0) ? outputBuffer1_state : outputBuffer2_state;
if(i%2==0)printf("Buffer=buffer1, state=%d \n", *buffer_state);
if(i%2==1)printf("Buffer=buffer2, state=%d \n", *buffer_state);
printf(" Waiting for release...");
while (*buffer_state == 1);// printf(" Waiting for release..."); //wait for the device to release the buffer
printf("\n Released! Buffer state = %d",*buffer_state);
printf("\n Copying memory...");
cudaMemcpyAsync(output_buffer, &data[0], sizeof(int) * DATA_BUFFER_SIZE * EVENT_DIMENSION, cudaMemcpyHostToDevice, streamc);
printf("\n Waiting for copy...");
cudaStreamSynchronize(streamc); //Wait for the copy to be done
*buffer_state = 1; //Release the buffer to the device
printf("\n Releasing Buffer...\n Buffer state = %d \n",*buffer_state);
printf("val i=%d \n",i);
}
*terminate_thread = 1; //release the device
cudaStreamSynchronize(streamk);
cudaMemcpy(&h_outputList[0], d_outputList, sizeof(int) * NUM_TRANSFORMS* DATA_BUFFER_SIZE, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
for(int i=0; i<NUM_TRANSFORMS; i++) {
for(int j=0; j<DATA_BUFFER_SIZE; j++) {
int idx = i*DATA_BUFFER_SIZE+j;
printf("%d, ",h_outputList[idx]);
}
printf("\n");
}
printf("\n");
printf("Done \n");
return 0;
}
output:
Run
Launching kernel with grid=1,60, block=32
========== Iteration 0 ==========
Buffer=buffer1, state=0
Waiting for release...
Released! Buffer state = 0
Copying memory...
Waiting for copy...
Releasing Buffer...
Buffer state = 1
val i=0
========== Iteration 1 ==========
Buffer=buffer2, state=0
Waiting for release...
Released! Buffer state = 0
Copying memory...
Waiting for copy...
Releasing Buffer...
Buffer state = 1
val i=1
========== Iteration 2 ==========
Buffer=buffer1, state=1
Waiting for release...
Released! Buffer state = 0
Copying memory...
Waiting for copy...
Releasing Buffer...
Buffer state = 1
val i=2
========== Iteration 3 ==========
Buffer=buffer2, state=1
Waiting for release...
Released! Buffer state = 0
Copying memory...
Waiting for copy...
Releasing Buffer...
Buffer state = 1
val i=3
========== Iteration 4 ==========
Buffer=buffer1, state=1
Waiting for release...
Released! Buffer state = 0
Copying memory...
Waiting for copy...
Releasing Buffer...
Buffer state = 1
val i=4
========== Iteration 5 ==========
Buffer=buffer2, state=1
Waiting for release...
Released! Buffer state = 0
Copying memory...
Waiting for copy...
Releasing Buffer...
Buffer state = 1
val i=5
========== Iteration 6 ==========
Buffer=buffer1, state=1
Waiting for release...
Released! Buffer state = 0
Copying memory...
Waiting for copy...
Releasing Buffer...
Buffer state = 1
val i=6
========== Iteration 7 ==========
Buffer=buffer2, state=1
Waiting for release...
Released! Buffer state = 0
Copying memory...
Waiting for copy...
Releasing Buffer...
Buffer state = 1
val i=7
========== Iteration 8 ==========
Buffer=buffer1, state=1
Waiting for release...
Released! Buffer state = 0
Copying memory...
Waiting for copy...
Releasing Buffer...
Buffer state = 1
val i=8
========== Iteration 9 ==========
Buffer=buffer2, state=1
Waiting for release...
Released! Buffer state = 0
Copying memory...
Waiting for copy...
Releasing Buffer...
Buffer state = 1
val i=9
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
Done
Basically that large matrix at the bottom should contain only 9’s…well if anyone has spotted the mistake I’d be thrilled to find out what I’ve done wrong. Many thanks!
Timo