Why is simple copy slower than calculations + copy?

Hello,

I have run into a strange problem: I want to copy triangle data from one array (trianglesSrc) to another (trianglesDst).
There are two alternatives:
one is to simply copy the data from trianglesSrc to trianglesDst, implemented as cpyTriangles1;
the other is to do some pre-calculations before copying, for Woop ray/triangle intersection tests later, implemented as cpyTriangles2.

The problem is that in release mode, cpyTriangles2 (ca. 0.44 ms) is consistently faster than cpyTriangles1 (ca. 0.58 ms) for 1,000,000 triangles.
I have also taken a look at the SASS code. I cannot fully understand it, but cpyTriangles2 compiles to nearly 400 lines of SASS while cpyTriangles1 compiles to fewer than 100.
Could anyone explain this?
I have a GeForce GTX 1080 card, and the code is compiled for compute capability 6.1.
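
For context, both kernels move exactly the same amount of data, so my own back-of-the-envelope bandwidth estimate (assuming 16 bytes per float4) already shows something odd:

// 1,000,000 triangles * 3 vertices * 16 B = 48 MB read + 48 MB written = 96 MB per kernel
// cpyTriangles1: 96 MB / 0.58 ms ~ 165 GB/s
// cpyTriangles2: 96 MB / 0.44 ms ~ 218 GB/s
// A GTX 1080 has a theoretical peak of roughly 320 GB/s, so both kernels appear
// memory-bound, yet the pure copy is further from the peak.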

int main() {

	// Generate triangle data
	float4 *trianglesSrc, *trianglesDst;
	unsigned int size = 1000000;
	cudaMalloc((void **)&trianglesSrc, size * 3 * sizeof(float4));
	cudaMalloc((void **)&trianglesDst, size * 3 * sizeof(float4));

	unsigned int numThreads = 256u;
	unsigned int numBlocks = (size + numThreads - 1u) / numThreads;
	initTriangles<<<numBlocks, numThreads>>>(trianglesSrc, size);

	// Copy triangle data
	float time;
	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	cudaEventRecord(start, 0);
	// Why is cpyTriangles1 slower than cpyTriangles2?
	cpyTriangles1<<<numBlocks, numThreads>>>(trianglesDst, trianglesSrc, size); // ca. 0.58 ms
	// cpyTriangles2<<<numBlocks, numThreads>>>(trianglesDst, trianglesSrc, size); // ca. 0.44 ms
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);

	printf("time %f ms\n", time);

	cudaFree(trianglesSrc);
	cudaFree(trianglesDst);
	return 0;
}
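
For what it is worth, a more defensive way to time the kernels is to add a warm-up launch and to average over several runs; here is a minimal sketch of just the timing part (same kernels, events, and launch configuration as above):

	// Warm-up launch, so that one-time effects (first use of the data, clock ramp-up)
	// do not end up in the measurement
	cpyTriangles1<<<numBlocks, numThreads>>>(trianglesDst, trianglesSrc, size);
	cudaDeviceSynchronize();

	const int runs = 10;
	cudaEventRecord(start, 0);
	for (int i = 0; i < runs; ++i)
		cpyTriangles1<<<numBlocks, numThreads>>>(trianglesDst, trianglesSrc, size);
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop);
	printf("avg time %f ms\n", time / runs);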

Other code:

#include <cstdio>
#include <vector_types.h>

#define A 1664525u 
#define C 1013904223u

__global__
void initTriangles(float4 *triangles, unsigned int size) {
	unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;

	if (idx >= size)
		return;

	idx *= 3u;
	// Separate running counter for the pseudo-random values, so that idx itself
	// still indexes the output correctly (the original post incremented idx and
	// wrote past the end of the buffer for the last threads)
	unsigned int seed = idx;

	float x = ((seed++) * A) * 1.f / C;
	float y = ((seed++) * A) * 1.f / C;
	float z = ((seed++) * A) * 1.f / C;
	triangles[idx] = make_float4(x, y, z, 1.f);

	x = ((seed++) * A) * 1.f / C;
	y = ((seed++) * A) * 1.f / C;
	z = ((seed++) * A) * 1.f / C;
	triangles[idx + 1u] = make_float4(x, y, z, 1.f);

	x = ((seed++) * A) * 1.f / C;
	y = ((seed++) * A) * 1.f / C;
	z = ((seed++) * A) * 1.f / C;
	triangles[idx + 2u] = make_float4(x, y, z, 1.f);
}

__global__
void cpyTriangles1(float4 *trianglesDst, float4 *trianglesSrc, unsigned int size) {
	unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;

	if (idx >= size)
		return;

	idx *= 3u;

	trianglesDst[idx] = trianglesSrc[idx];
	trianglesDst[idx + 1u] = trianglesSrc[idx + 1u];
	trianglesDst[idx + 2u] = trianglesSrc[idx + 2u];
}

__host__ __device__ bool gluInvertMatrix(const float m[16], float invOut[16])
{
	float inv[16], det;
	int i;

	inv[0] = m[5] * m[10] * m[15] -
		m[5] * m[11] * m[14] -
		m[9] * m[6] * m[15] +
		m[9] * m[7] * m[14] +
		m[13] * m[6] * m[11] -
		m[13] * m[7] * m[10];

	inv[4] = -m[4] * m[10] * m[15] +
		m[4] * m[11] * m[14] +
		m[8] * m[6] * m[15] -
		m[8] * m[7] * m[14] -
		m[12] * m[6] * m[11] +
		m[12] * m[7] * m[10];

	inv[8] = m[4] * m[9] * m[15] -
		m[4] * m[11] * m[13] -
		m[8] * m[5] * m[15] +
		m[8] * m[7] * m[13] +
		m[12] * m[5] * m[11] -
		m[12] * m[7] * m[9];

	inv[12] = -m[4] * m[9] * m[14] +
		m[4] * m[10] * m[13] +
		m[8] * m[5] * m[14] -
		m[8] * m[6] * m[13] -
		m[12] * m[5] * m[10] +
		m[12] * m[6] * m[9];

	inv[1] = -m[1] * m[10] * m[15] +
		m[1] * m[11] * m[14] +
		m[9] * m[2] * m[15] -
		m[9] * m[3] * m[14] -
		m[13] * m[2] * m[11] +
		m[13] * m[3] * m[10];

	inv[5] = m[0] * m[10] * m[15] -
		m[0] * m[11] * m[14] -
		m[8] * m[2] * m[15] +
		m[8] * m[3] * m[14] +
		m[12] * m[2] * m[11] -
		m[12] * m[3] * m[10];

	inv[9] = -m[0] * m[9] * m[15] +
		m[0] * m[11] * m[13] +
		m[8] * m[1] * m[15] -
		m[8] * m[3] * m[13] -
		m[12] * m[1] * m[11] +
		m[12] * m[3] * m[9];

	inv[13] = m[0] * m[9] * m[14] -
		m[0] * m[10] * m[13] -
		m[8] * m[1] * m[14] +
		m[8] * m[2] * m[13] +
		m[12] * m[1] * m[10] -
		m[12] * m[2] * m[9];

	inv[2] = m[1] * m[6] * m[15] -
		m[1] * m[7] * m[14] -
		m[5] * m[2] * m[15] +
		m[5] * m[3] * m[14] +
		m[13] * m[2] * m[7] -
		m[13] * m[3] * m[6];

	inv[6] = -m[0] * m[6] * m[15] +
		m[0] * m[7] * m[14] +
		m[4] * m[2] * m[15] -
		m[4] * m[3] * m[14] -
		m[12] * m[2] * m[7] +
		m[12] * m[3] * m[6];

	inv[10] = m[0] * m[5] * m[15] -
		m[0] * m[7] * m[13] -
		m[4] * m[1] * m[15] +
		m[4] * m[3] * m[13] +
		m[12] * m[1] * m[7] -
		m[12] * m[3] * m[5];

	inv[14] = -m[0] * m[5] * m[14] +
		m[0] * m[6] * m[13] +
		m[4] * m[1] * m[14] -
		m[4] * m[2] * m[13] -
		m[12] * m[1] * m[6] +
		m[12] * m[2] * m[5];

	inv[3] = -m[1] * m[6] * m[11] +
		m[1] * m[7] * m[10] +
		m[5] * m[2] * m[11] -
		m[5] * m[3] * m[10] -
		m[9] * m[2] * m[7] +
		m[9] * m[3] * m[6];

	inv[7] = m[0] * m[6] * m[11] -
		m[0] * m[7] * m[10] -
		m[4] * m[2] * m[11] +
		m[4] * m[3] * m[10] +
		m[8] * m[2] * m[7] -
		m[8] * m[3] * m[6];

	inv[11] = -m[0] * m[5] * m[11] +
		m[0] * m[7] * m[9] +
		m[4] * m[1] * m[11] -
		m[4] * m[3] * m[9] -
		m[8] * m[1] * m[7] +
		m[8] * m[3] * m[5];

	inv[15] = m[0] * m[5] * m[10] -
		m[0] * m[6] * m[9] -
		m[4] * m[1] * m[10] +
		m[4] * m[2] * m[9] +
		m[8] * m[1] * m[6] -
		m[8] * m[2] * m[5];

	det = m[0] * inv[0] + m[1] * inv[4] + m[2] * inv[8] + m[3] * inv[12];

	if (det == 0)
		return false;

	det = 1.0f / det;

	for (i = 0; i < 16; i++)
		invOut[i] = inv[i] * det;

	return true;
}

inline __host__ __device__ float4 cross(float4 a, float4 b)
{
	return make_float4(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x, 0.f);
}

inline __host__ __device__ float4 operator-(float4 a, float4 b)
{
	return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
}

__global__
void cpyTriangles2(float4 *trianglesDst, float4 *trianglesSrc, unsigned int size) {
	unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;

	if (idx >= size)
		return;

	idx *= 3u;

	float4 v0 = trianglesSrc[idx];
	float4 v1 = trianglesSrc[idx + 1u];
	float4 v2 = trianglesSrc[idx + 2u];

	float4 mtxIn[4], mtx[4];
	mtxIn[0] = v0 - v2;
	mtxIn[1] = v1 - v2;
	mtxIn[2] = cross(v0 - v2, v1 - v2);
	mtxIn[3] = v2;
	gluInvertMatrix((float *)mtxIn, (float *)mtx);

	trianglesDst[idx] = make_float4(mtx[0].z, mtx[1].z, mtx[2].z, -mtx[3].z);
	trianglesDst[idx + 1u] = make_float4(mtx[0].x, mtx[1].x, mtx[2].x, mtx[3].x);
	trianglesDst[idx + 2u] = make_float4(mtx[0].y, mtx[1].y, mtx[2].y, mtx[3].y);
}

Your code has errors and won’t compile for me.

Hello, I am sorry, I have corrected the error. Now it compiles.

I can confirm your observation for a Quadro P2000: 1.149 milliseconds with cpyTriangles1(), 0.813 milliseconds with cpyTriangles2(). During a cursory inspection of the source code, I did not spot any obvious discrepancies in the per-thread memory access patterns of the two kernels.

However, the additional computation in cpyTriangles2() introduces delays between the loads and the stores, so a completely different stream of memory transactions is delivered to the GPU memory controller; a difference in the efficiency of these two transaction streams might explain the performance difference. I also note that neither kernel follows the optimal base + thread-index access pattern.
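
For illustration, a copy kernel that does follow the base + thread-index pattern would let adjacent threads touch adjacent float4 elements. A minimal sketch (cpyTrianglesCoalesced is simply a name I am making up here; it would need to be launched with (size * 3u + numThreads - 1u) / numThreads blocks):

__global__
void cpyTrianglesCoalesced(float4 *trianglesDst, float4 *trianglesSrc, unsigned int size) {
	unsigned int numElems = size * 3u; // treat the triangles as a flat array of float4
	unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;

	// base + thread-index: adjacent threads access adjacent 16-byte elements,
	// so each warp issues fully coalesced memory transactions
	if (idx < numElems)
		trianglesDst[idx] = trianglesSrc[idx];
}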

You might want to look at the details of the memory performance counters of the CUDA profiler to see whether you can spot some interesting difference between the two kernels. Also, check whether there is sufficient occupancy.
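
The predicted residency can also be queried programmatically; a minimal host-side sketch, using the block size from your code:

	int blocks1 = 0, blocks2 = 0;
	cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks1, cpyTriangles1, 256, 0);
	cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks2, cpyTriangles2, 256, 0);
	printf("resident blocks per SM: cpyTriangles1 = %d, cpyTriangles2 = %d\n", blocks1, blocks2);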

Hi njuffa,

thank you for your advice. I have used the CUDA profiler to analyze the code.
As for the memory performance counters (I checked them on the Timeline page), the curves show no difference when execution moves from the initTriangles phase to the cpyTriangles1/cpyTriangles2 phase.
Besides, the report shows that cpyTriangles1 achieves 100% occupancy (but is slower),
while cpyTriangles2 achieves only 50% occupancy (but is faster).
Also, the Max Block Limit Registers value (maximum thread blocks per multiprocessor as limited by register usage) is 10 for cpyTriangles1 but only 4 for cpyTriangles2.
What do you think about this?

p.s. I have observed the same behavior on a 1070 card, though the difference is smaller there.
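
p.p.s. To cross-check the profiler's register numbers, I plan to query the compiled register counts directly; a minimal host-side sketch:

	cudaFuncAttributes attr1, attr2;
	cudaFuncGetAttributes(&attr1, cpyTriangles1);
	cudaFuncGetAttributes(&attr2, cpyTriangles2);
	printf("registers per thread: cpyTriangles1 = %d, cpyTriangles2 = %d\n", attr1.numRegs, attr2.numRegs);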