Feedback on Iterative solver

Hi, I’ve just started using CUDA. I’ve finished my first program and would like to get some feedback on it.

Here is the kernel:

__global__ void CalculateOnGPUKernelSingle(Point* readVec, Point* writeVec, int size, DiskParameters* discParams)
{
	extern __shared__ Point s_mem[];
	unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;

	//Every in-range thread stages its point in shared memory (including the
	//block-edge threads, whose values are read as neighbours), and all threads
	//reach the barrier before any of them return.
	if (index < size)
		s_mem[threadIdx.x] = readVec[index];
	__syncthreads();

	//Global boundary points and block-edge threads are not updated
	if (index > size - 1 || index < 1)
		return;
	if (threadIdx.x < 1 || threadIdx.x >= blockDim.x - 1)
		return;

	Point point;
	point.Set(&s_mem[threadIdx.x - 1], &s_mem[threadIdx.x + 1], discParams);
	writeVec[index] = point;
}
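
For reference, Point and DiskParameters look roughly like this. I've stripped them down to just the fields and methods used in the code I'm posting, so treat this as a sketch rather than the real definitions:

struct DiskParameters
{
	//Only the fields referenced in the calculations below
	double Us0;
	double alpha;
	double gammaSq;
	double deltaRo;
};

struct Point
{
	double sigma;
	double dSigmaDRo;
	double dSigmaDTau;

	__host__ __device__ void Set(const Point* left, const Point* right, const DiskParameters* params);
	__host__ __device__ void CalcDSigmaDTau(const Point* left, const Point* right, const DiskParameters* params);
	//...plus the other Calc* helpers, all __host__ __device__
};

//EPSILON is a small convergence tolerance defined elsewhere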

Almost all of the work is done in point.Set, which I moved over from the serial implementation. Essentially, each point calculates its values based on its neighbors' values from the previous step. I’m not going to post all of the calculation code since most of it follows the same pattern:

__host__ __device__ void Point::Set(const Point* left,const Point* right, const DiskParameters* params)
{
	double lastSigma;

	CalcDSigmaDRoSecondApprox(left, right, params);
	CalcDSigmaDTau(left, right, params);
	CalcSigma(left, this, params);
	lastSigma = 0;

	//Iterate until sigma stops changing (fabs so we get the floating-point overload)
	while (fabs(sigma - lastSigma) > EPSILON)
	{
		lastSigma = sigma;
		CalcDSigmaDTauSecondApprox(left, right, this, params);
		CalcSigma(left, this, params);
	}

}

//All the other calcs are similar to this one
__host__ __device__ void Point::CalcDSigmaDTau(const Point* left, const Point* right, const DiskParameters* params)
{
	double leftPow, rightPow, result = 0;
	leftPow = pow(left->sigma, 3);
	rightPow = pow(right->sigma, 3);

	result += params->Us0*(left->dSigmaDTau + right->dSigmaDTau);
	result += -(params->Us0*params->Us0)*(left->dSigmaDRo - right->dSigmaDRo);
	result += -params->deltaRo*(2 * params->alpha*(leftPow + rightPow) - params->gammaSq*(left->sigma + right->sigma));
	result /= (2*params->Us0);

	if (fabs(result) < EPSILON)
		result = 0;
	dSigmaDTau = result;
}
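
For comparison, the serial implementation I ported this from is basically a double-buffered sweep over the interior points. A trimmed-down sketch (assuming the two rows are plain std::vector<Point>, with the surrounding class stripped away) looks like this:

#include <vector>

void SerialSweep(std::vector<Point> rows[2], int steps, const DiskParameters& params)
{
	const int n = static_cast<int>(rows[0].size());
	for (int step = 1; step < steps; ++step)
	{
		//Alternate which row is read and which is written, exactly like the GPU loop
		const int readRow = (step + 1) % 2;
		const int writeRow = (readRow + 1) % 2;
		const std::vector<Point>& read = rows[readRow];
		std::vector<Point>& write = rows[writeRow];

		//Interior points only, matching the index guards in the kernel
		for (int j = 1; j < n - 1; ++j)
			write[j].Set(&read[j - 1], &read[j + 1], &params);
	}
}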

The code below is how I launch the kernel:

int n = m_hostDistribution[0].size();
thrust::device_vector<Point> d_vec[2];
//resize (not reserve), so the vectors actually contain n elements to copy into
d_vec[0].resize(n);
d_vec[1].resize(n);
thrust::copy(m_hostDistribution[0].begin(), m_hostDistribution[0].end(), d_vec[0].begin());
thrust::copy(m_hostDistribution[1].begin(), m_hostDistribution[1].end(), d_vec[1].begin());
Point* d_readPtr;
Point* d_writePtr;
DiskParameters* d_diskParam;
cudaError_t rc = cudaMalloc((void**)&d_diskParam, sizeof(DiskParameters));
if (rc != cudaSuccess)
	printf("Could not allocate memory: %d\n", rc);
cudaDeviceSynchronize();
checkCudaErrors(cudaMemcpyAsync(d_diskParam, &m_discParams, sizeof(DiskParameters), cudaMemcpyHostToDevice));

dim3 threadB(std::min(n, 1024), 1, 1);
//Round up so the grid covers all n points when n is not a multiple of the block size
dim3 blockB((n + threadB.x - 1) / threadB.x, 1, 1);

int readRow, writeRow;
for (int i = 1; i < m_steps; ++i)
{
	//Ping-pong between the two device buffers
	readRow = (i + 1) % 2;
	writeRow = (readRow + 1) % 2;
	d_readPtr = thrust::raw_pointer_cast(d_vec[readRow].data());
	d_writePtr = thrust::raw_pointer_cast(d_vec[writeRow].data());
	CalculateOnGPUKernelSingle<<<blockB, threadB, 1024 * sizeof(Point)>>>(d_readPtr, d_writePtr, n, d_diskParam);
	checkCudaErrors(cudaGetLastError());
	thrust::copy(d_vec[writeRow].begin(), d_vec[writeRow].end(), m_hostDistribution[writeRow].begin());
}

I ran the program through NVVP and some of the key results were:

  1. Low Memcpy/Compute Overlap - I don't know if it's possible to do these async, since each step needs the previous one to finish (more on that later; see also the sketch right after this list)
  2. Low Kernel Concurrency - Same as above
  3. Low Memcpy Throughput - This one comes and goes
  4. Occupancy hovers around 50%; the theoretical occupancy is also 50% (whatever that means)
  5. Kernel is bound by memory and instruction latency
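
Regarding points 1 to 3: the per-step thrust::copy back to the host is my main suspect. If I didn't need the intermediate steps on the host, I assume the loop could be reduced to something like this, with a single copy at the end (kernels on the default stream run in order, so I don't think any extra synchronization is needed between steps):

for (int i = 1; i < m_steps; ++i)
{
	int readRow = (i + 1) % 2;
	int writeRow = (readRow + 1) % 2;
	Point* d_readPtr = thrust::raw_pointer_cast(d_vec[readRow].data());
	Point* d_writePtr = thrust::raw_pointer_cast(d_vec[writeRow].data());
	CalculateOnGPUKernelSingle<<<blockB, threadB, 1024 * sizeof(Point)>>>(d_readPtr, d_writePtr, n, d_diskParam);
	checkCudaErrors(cudaGetLastError());
}
//Only the row written in the final iteration is copied back (assumes m_steps > 1)
int lastRow = (m_steps + 1) % 2;
thrust::copy(d_vec[lastRow].begin(), d_vec[lastRow].end(), m_hostDistribution[lastRow].begin());

I'd be interested to hear whether this is the right direction, or whether streams and async copies are worth the trouble here.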

My question with all of this is really just a general code review; any suggestions are welcome.

The second part of this post has to do with changing how the kernel works. Below is a naive optimization I tried: increasing the work done per thread (a rough sketch of how I launch this version is after the kernel). The performance went down a lot.

__global__ void CalculateOnGPUKernel(Point* readVec, Point* writeVec, int size, int height, DiskParameters* discParams)
{
	extern __shared__ Point s_points[];
	unsigned int tid = threadIdx.x;
	unsigned int index = blockIdx.x * blockDim.x + tid;
	if (index > size - 1 || index < 1)
		return;
	
	//Base of pyramid
	unsigned int base = 2 * height;
	//Current base
	unsigned int currBase = base;
	//index in shared mem
	unsigned int sharedIndex = tid * (base - 2);
	unsigned int offSet;
	//Local storage for points, hardcoded for base size of 8
	Point points[8];
	
	//Copy points to local;
	for (unsigned int i = 0; i < base; ++i)
		points[i] = readVec[index * 8 + i];

	//Calc top
	//Here we construct the pyramid
	//^ - go to shared to be used by left neighbouring thread
	//to be used for calculating points.
	//      * *
	//    ^ ^ * * 
	//  ^ ^ * * * * 
	//^ ^ * * * * * *
	while (currBase > 2)
	{
		currBase -= 2;
		offSet = sharedIndex + base - currBase;
		//Copy first 2 from prev row to shared
		s_points[offSet - 2] = points[0];
		s_points[offSet - 1] = points[1];
		
		for (unsigned int i = 0; i < currBase; ++i)
			points[i].Set(&points[i], &points[i + 2], discParams);
		
	}
	__syncthreads();
	
	if (tid >= blockDim.x - 1)
		return;
	
	//Here we construct reverse pyramid using the shared members of our right neighbour
	unsigned int i;
	while (currBase < base)
	{
		offSet = base - currBase;
		for (i = 0; i < currBase - 2; ++i)
			points[offSet + i].Set(&points[offSet + i], &points[offSet + i + 2], discParams);
		
		for (; i < currBase; ++i)
			points[offSet + i].Set(&points[offSet + i], &s_points[sharedIndex + (base - 2) + i], discParams);

		currBase += 2;
	}
	//We read 0 - 8, but write 4 - 12
	index += floor(base / 2.f);
	for (i = 0; i < base; ++i)
		writeVec[index + i] = points[i];
}
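
For completeness, this is roughly how I launch that version. It's trimmed down, height is fixed at 4 to match the hard-coded Point points[8], and I'm glossing over what happens when n isn't an exact multiple of the base:

const int height = 4;        //base of 8 points per thread, matching Point points[8]
const int base = 2 * height;
dim3 threadP(std::min(n / base, 1024), 1, 1);
dim3 blockP(std::max(1, n / (base * (int)threadP.x)), 1, 1);
//Each thread parks (base - 2) of its points in shared memory for its left neighbour
size_t sharedBytes = threadP.x * (base - 2) * sizeof(Point);

CalculateOnGPUKernel<<<blockP, threadP, sharedBytes>>>(d_readPtr, d_writePtr, n, height, d_diskParam);
checkCudaErrors(cudaGetLastError());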

Is there any need to improve the original method? Is this second method redundant?