I am using CUDA 9.1 with VS2015 on a Win10 machine using C++.
I am doing something that I thought would be simple but am stuck on it.
I have a function that takes in a listing of pixel points, tests to see if three points are in a line and, if so, marks the middle point invalid. The function later compresses the points list so that all the valid points are next to each other, records which points are to the left and right of each point, and updates the total number of valid points. The idea is to keep doing this until there are no more points left to invalidate.
One issue is that the number of points may be several multiples of the number of threads for each block (512). Thus, I have a variable (cyclecount) that can increment and the multiple of that used to index into the points list.
All of that is the set up for my problem. I am seeing weird crashes when I try and initialize the cyclecount (line 49 iin the listing below). I am using __synchthreads inside conditionals but only in ways that all the threads should reach the same point. When I include the cyclecount initialization, we crash. When I leave it out, it runs fine but will not go through the loop more than once.
Any help understanding what the problem is appreciated.
My code is as follows:
__global__ void gpuStripper(int numContours, int *onumElements, int *ocumElements, bool *oValidCont, int *oxvals, int *oyvals, unsigned int*oContNum, unsigned int*opointnum, int *osnumElements, int *sxvals, int *syvals, unsigned int*oleftNeighborNum, unsigned int*orightNeighborNum, bool *svalid)
{
// this function strips points that are colinear with their surrounding points
// if a point is colinear, it is marked invalid
// then valid points are compressed against each other by moving the x and y points and also moving the valid indication
// the function updates the scrubbed number of elements count - snumelements
// the function closes by correcting the left and right neighbor references
__shared__ int tnumElements;
__shared__ int tocumElements;
__shared__ int cyclecount;
__shared__ bool globalend;
__shared__ bool compressorend;
int tidx;
int offset = 0;
int s1, s2;
int top, bottom;
float tresult;
tidx = threadIdx.x;
if (tidx == 0)
{
tnumElements = onumElements[blockIdx.x];
tocumElements = ocumElements[blockIdx.x];
cyclecount = 0;
globalend = false;
compressorend = false;
}
__syncthreads();
// look for collinear points and delete them - same as OpenCV CHAIN_APPROX_SIMPLE
// check here for numbers less than 3
if (tnumElements > DCE_MIN_POINTS)
{
while (!globalend)
{
if (tidx == 0)
{
globalend = true;
compressorend = false;
cyclecount = 0; // INCLUDING THIS LINE CAUSES A CRASH
}
__syncthreads();
while ((cyclecount*DCE_THREADS_PER_BLOCK) < tnumElements)
{
offset = tidx + (cyclecount*DCE_THREADS_PER_BLOCK);
if (offset < tnumElements)
{
// note: we are not looking at the colinearity of the first or last element
if ((offset > 0) && (offset < tnumElements - 2))
{
// simple check of contiguous XY values
if ((sxvals[offset + tocumElements - 1] == sxvals[offset + tocumElements]) && (sxvals[offset + tocumElements + 1] == sxvals[offset + tocumElements]))
{
svalid[offset + tocumElements] = false;
globalend = false;
}
else if ((syvals[offset + tocumElements - 1] == syvals[offset + tocumElements]) && (syvals[offset + tocumElements + 1] == syvals[offset + tocumElements]))
{
svalid[offset + tocumElements] = false;
globalend = false;
}
else
{
// consider adding a tolerance factor here
if ((syvals[offset + tocumElements] - syvals[offset + tocumElements - 1]) * (sxvals[offset + tocumElements + 1] - sxvals[offset + tocumElements]) == (syvals[offset + tocumElements + 1] - syvals[offset + tocumElements]) * (sxvals[offset + tocumElements] - sxvals[offset + tocumElements] - 1))
{
svalid[offset + tocumElements] = false;
globalend = false;
}
}
}
}
if (tidx == 0)
cyclecount++;
__syncthreads();
}
__syncthreads();
}
__syncthreads();
if (tidx == 0)
{
// this is a double loop that should only go through things once
compressorend = false;
top = 0;
for (int x = 0; x < tnumElements - 1; x++)
{
if ((!svalid[x + tocumElements]) && (!compressorend))
{
compressorend = true;
if (top <= x)
top = x + 1;
for (int y = top; y < tnumElements; y++)
{
if (svalid[y + tocumElements])
{
sxvals[x + tocumElements] = sxvals[y + tocumElements];
syvals[x + tocumElements] = syvals[y + tocumElements];
opointnum[x + tocumElements] = opointnum[y + tocumElements];
svalid[x + tocumElements] = true;
svalid[y + tocumElements] = false;
top = y;
compressorend = false;
break;
}
}
}
else if (compressorend)
x = tnumElements;
}
}
// update the number of elements for the contour
__syncthreads();
if (tidx == 0)
{
// this is a binary search
top = tnumElements-1;
bottom = 0;
while ((top != bottom) && (top > bottom))
{
if (!svalid[((top + bottom) / 2) + tocumElements])
top = ((top + bottom) / 2) - 1;
else
bottom = ((top + bottom) / 2) + 1;
}
osnumElements[blockIdx.x] = top + 1;
}
__syncthreads();
// fix the left and right neighbor references
if (tidx == 0)
cyclecount = 0;
__syncthreads();
while ((cyclecount*DCE_THREADS_PER_BLOCK) < tnumElements)
{
offset = tidx + (cyclecount*DCE_THREADS_PER_BLOCK);
if ((offset > 0) && (offset < (osnumElements[blockIdx.x] - 1)))
{
oleftNeighborNum[offset + tocumElements] = offset - 1;
orightNeighborNum[offset + tocumElements] = offset + 1;
}
if (tidx == 0)
cyclecount++;
__syncthreads();
}
__syncthreads();
// correct the first and last elements
if (tidx == 0)
{
// first element
oleftNeighborNum[tocumElements] = osnumElements[blockIdx.x] - 1;
orightNeighborNum[tocumElements] = 1;
// last element
oleftNeighborNum[osnumElements[blockIdx.x] - 1 + tocumElements] = osnumElements[blockIdx.x] - 2;
orightNeighborNum[osnumElements[blockIdx.x] - 1 + tocumElements] = 0;
}
}
__syncthreads();
}