Performance difference for CUDA between experiments and the documentation for the float/double data types...

As described in Table 2 of the CUDA C Programming Guide (http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions), the number of operations per clock cycle per multiprocessor is 128 for 32-bit floating-point add but only 4 for 64-bit floating-point add; in other words, 64-bit add should be 32 times slower.

However, when I use the following code to test the speed difference, the float version is only 9 to 10 times faster than the double version. Does anyone know the reason?

    #include <cstdio>    // printf
    #include <cstdlib>   // malloc, free

    #define N 1000
    typedef double Real; // double or float
    
    // Device code
     __global__ void VecAdd(Real* A, Real* B, Real* C)
    {
    	int i = blockDim.x * blockIdx.x + threadIdx.x;
    	if (i < N) {
    		Real a = A[i];
    		Real b = B[i];
    		Real c = 0.0f;
    		for (int j = 0; j < 10000; j++)
    		{
    			c += (a + b);
    		}
    		C[i] = c;
    	}
    }
    
    // Host code
    int main()
    {
    	size_t size = N * sizeof(Real);
    	// Allocate input vectors h_A and h_B in host memory
    	Real* h_A = (Real*)malloc(size);
    	Real* h_B = (Real*)malloc(size);
    	Real* h_C = (Real*)malloc(size);
    	// Initialize input vectors
    	for (int i = 0; i < N; i++)
    	{
    		h_A[i] = 1.0f + i * 0.1f;
    		h_B[i] = 100.0f + i * 0.1f;
    	}
    	// Allocate vectors in device memory
    	Real* d_A;
    	cudaMalloc(&d_A, size);
    	Real* d_B;
    	cudaMalloc(&d_B, size);
    	Real* d_C;
    	cudaMalloc(&d_C, size);
    	// Copy vectors from host memory to device memory
    	cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    	cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    	// Invoke kernel
    	int threadsPerBlock = 256;
    	int blocksPerGrid =
    		(N + threadsPerBlock - 1) / threadsPerBlock;
    
    	// Time measurement starts
    	cudaEvent_t     start, stop;
    	cudaEventCreate(&start);
    	cudaEventCreate(&stop);
    	cudaEventRecord(start, 0);
    	cudaEventSynchronize(start);
    
    	for (int i = 0; i < 10000; i++)
    	{
    		VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C);
    	}
    	
    	// Time measurement ends
    	cudaEventRecord(stop, 0);
    	cudaEventSynchronize(stop);
    	float   elapsedTime;
    	cudaEventElapsedTime(&elapsedTime, start, stop);
    	printf("Time to generate:  %3.8f ms\n", elapsedTime);
    	cudaEventDestroy(start);
    	cudaEventDestroy(stop);
    
    	// Copy result from device memory to host memory
    	// h_C contains the result in host memory
    	cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
    	// Free device memory
    	cudaFree(d_A);
    	cudaFree(d_B);
    	cudaFree(d_C);
    	// Free host memory
    	free(h_A);
    	free(h_B);
    	free(h_C);
    }
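(Side note, a minimal error-checking sketch that is not part of the original code: wrapping the CUDA calls and checking the launch loop would rule out silent failures distorting the timing. The CHECK macro below is hypothetical and shown only for illustration.)

    // Hypothetical helper (a sketch only, not part of the timed benchmark):
    // checks the result of a CUDA runtime call and aborts on failure.
    #define CHECK(call)                                                       \
        do {                                                                  \
            cudaError_t err_ = (call);                                        \
            if (err_ != cudaSuccess) {                                        \
                printf("CUDA error %s at %s:%d\n",                            \
                       cudaGetErrorString(err_), __FILE__, __LINE__);         \
                exit(1);                                                      \
            }                                                                 \
        } while (0)

    // Possible usage inside the timing loop:
    //     VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C);
    //     CHECK(cudaGetLastError());         // catches launch/configuration errors
    // and once after the loop, before recording the stop event:
    //     CHECK(cudaDeviceSynchronize());    // catches asynchronous execution errors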

I used the following compile command:

nvcc -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin" -O0 main.cu -o main

I am using CUDA Toolkit 8.0 on 64-bit Windows 10 with a GeForce GTX 1080, driver version 372.90.
Here is the generated PTX:

//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-20732876
// Cuda compilation tools, release 8.0, V8.0.26
// Based on LLVM 3.4svn
//

.version 5.0
.target sm_20
.address_size 32

	// .globl	_Z6VecAddPfS_S_

.visible .entry _Z6VecAddPfS_S_(
	.param .u32 _Z6VecAddPfS_S__param_0,
	.param .u32 _Z6VecAddPfS_S__param_1,
	.param .u32 _Z6VecAddPfS_S__param_2
)
{
	.reg .pred 	%p<3>;
	.reg .f32 	%f<57>;
	.reg .b32 	%r<20>;


	ld.param.u32 	%r5, [_Z6VecAddPfS_S__param_0];
	ld.param.u32 	%r6, [_Z6VecAddPfS_S__param_1];
	ld.param.u32 	%r7, [_Z6VecAddPfS_S__param_2];
	mov.u32 	%r8, %ctaid.x;
	mov.u32 	%r9, %ntid.x;
	mov.u32 	%r10, %tid.x;
	mad.lo.s32 	%r1, %r8, %r9, %r10;
	setp.gt.s32	%p1, %r1, 999;
	@%p1 bra 	BB0_4;

	cvta.to.global.u32 	%r2, %r7;
	cvta.to.global.u32 	%r12, %r5;
	shl.b32 	%r13, %r1, 2;
	add.s32 	%r14, %r12, %r13;
	cvta.to.global.u32 	%r15, %r6;
	add.s32 	%r16, %r15, %r13;
	ld.global.f32 	%f5, [%r16];
	ld.global.f32 	%f6, [%r14];
	add.f32 	%f1, %f6, %f5;
	mov.f32 	%f56, 0f00000000;
	mov.u32 	%r19, 10000;

BB0_2:
	add.f32 	%f7, %f1, %f56;
	add.f32 	%f8, %f1, %f7;
	add.f32 	%f9, %f1, %f8;
	add.f32 	%f10, %f1, %f9;
	add.f32 	%f11, %f1, %f10;
	add.f32 	%f12, %f1, %f11;
	add.f32 	%f13, %f1, %f12;
	add.f32 	%f14, %f1, %f13;
	add.f32 	%f15, %f1, %f14;
	add.f32 	%f16, %f1, %f15;
	add.f32 	%f17, %f1, %f16;
	add.f32 	%f18, %f1, %f17;
	add.f32 	%f19, %f1, %f18;
	add.f32 	%f20, %f1, %f19;
	add.f32 	%f21, %f1, %f20;
	add.f32 	%f22, %f1, %f21;
	add.f32 	%f23, %f1, %f22;
	add.f32 	%f24, %f1, %f23;
	add.f32 	%f25, %f1, %f24;
	add.f32 	%f26, %f1, %f25;
	add.f32 	%f27, %f1, %f26;
	add.f32 	%f28, %f1, %f27;
	add.f32 	%f29, %f1, %f28;
	add.f32 	%f30, %f1, %f29;
	add.f32 	%f31, %f1, %f30;
	add.f32 	%f32, %f1, %f31;
	add.f32 	%f33, %f1, %f32;
	add.f32 	%f34, %f1, %f33;
	add.f32 	%f35, %f1, %f34;
	add.f32 	%f36, %f1, %f35;
	add.f32 	%f37, %f1, %f36;
	add.f32 	%f38, %f1, %f37;
	add.f32 	%f39, %f1, %f38;
	add.f32 	%f40, %f1, %f39;
	add.f32 	%f41, %f1, %f40;
	add.f32 	%f42, %f1, %f41;
	add.f32 	%f43, %f1, %f42;
	add.f32 	%f44, %f1, %f43;
	add.f32 	%f45, %f1, %f44;
	add.f32 	%f46, %f1, %f45;
	add.f32 	%f47, %f1, %f46;
	add.f32 	%f48, %f1, %f47;
	add.f32 	%f49, %f1, %f48;
	add.f32 	%f50, %f1, %f49;
	add.f32 	%f51, %f1, %f50;
	add.f32 	%f52, %f1, %f51;
	add.f32 	%f53, %f1, %f52;
	add.f32 	%f54, %f1, %f53;
	add.f32 	%f55, %f1, %f54;
	add.f32 	%f56, %f1, %f55;
	add.s32 	%r19, %r19, -50;
	setp.ne.s32	%p2, %r19, 0;
	@%p2 bra 	BB0_2;

	add.s32 	%r18, %r2, %r13;
	st.global.f32 	[%r18], %f56;

BB0_4:
	ret;
}

It’s generally not a good idea to run performance analysis with -O0 or anything less than full optimization.

I know why you did it here (to prevent the compiler from replacing your for loop with a multiplication), but there may be other important optimizations (e.g. register scheduling) performed during the optimization phases that you have prevented with -O0.

Instead, try to construct a loop in your kernel code that produces a stream of add instructions without resorting to -O0.
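For example (a minimal sketch under the same N/Real definitions, not a definitive benchmark; VecAddBench is a hypothetical name): chain each add through the accumulator so that every iteration depends on the previous result. Floating-point addition is not associative, so the compiler cannot legally collapse the chain into a multiplication, and the stream of adds survives full optimization.

    // Hypothetical benchmark kernel: the adds form a serial dependency chain,
    // so they cannot be folded into a single multiplication even at full optimization.
    __global__ void VecAddBench(Real* A, Real* B, Real* C)
    {
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < N) {
            Real a = A[i];
            Real b = B[i];
            Real c = 0;
            for (int j = 0; j < 10000; j++)
            {
                c += a;   // depends on the previous value of c
                a += b;   // the addend changes every iteration
            }
            C[i] = c + a; // consume both chains so neither is removed as dead code
        }
    }

This keeps the add instructions in the generated code while still letting the compiler do its normal register allocation and instruction scheduling.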

Also, I note that you’ve dumped the PTX, but what matters is the SASS. Your compile command is creating cc 2.0 PTX code (and cc 2.0 SASS code), but when run on a cc 6.1 GPU, the cc 2.0 PTX will be forward JIT-compiled to create cc 6.1 SASS code. So the PTX code may or may not bear any resemblance to what is actually running on the GPU. Instead, compile for a cc 6.1 target and inspect the SASS to determine what is actually running.

Alternatively, you could analyze your code in the visual profiler and see what the stall reasons are.
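(A possible command-line alternative, assuming the nvprof that ships with the same toolkit is available; the metric selection below is just an example, not a prescription:)

nvprof --metrics flop_count_sp,flop_count_dp,stall_exec_dependency main.exe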

In addition to what txbob said, even with fully optimized code you will rarely find floating-point intensive code where the performance ratio of float/double comes close to the theoretical ratio of the throughput of float/double operations.

The typical scenario is that the double-precision version is very close to completely limited by the low throughput of DP instructions (one might observe ~95% of the theoretical DP FLOPS on such code), while the single-precision version is limited by any number of different throughput limits and stalls.

For one, real-life floating-point-intensive code tends to have many more non-floating-point instructions than one might think (e.g. integer arithmetic for address computations, control flow such as loops, loads and stores); often about half of the instructions are not in the floating-point class. This is easy to see by looking at the machine code (SASS) with cuobjdump --dump-sass.
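As a rough, purely hypothetical illustration of how this dilutes the ratio (the numbers are assumptions, not measurements): suppose a kernel issues F floating-point instructions and an equal number F of non-floating-point instructions, and suppose the non-floating-point instructions issue at the full rate of 128 per clock per SM in both versions. Then, per SM,

    time_float  ~ F/128 + F/128
    time_double ~ F/4   + F/128

so the observed ratio is (F/4 + F/128) / (2F/128) = 33/2 = 16.5, already well below the theoretical 32, before any memory traffic or stalls are even considered.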

From a practical standpoint that is actually a good thing, in that the performance of real-life double-precision code on “DP starved” consumer GPUs is rarely as bad as the theoretical throughput (1/32 of SP) would suggest.

Hi txbob, thank you very much for your answer! I have now omitted the -O0 option, recompiled the code, and then used cuobjdump --dump-sass to get the SASS code:

Fatbin elf code:
================
arch = sm_20
code version = [1,7]
producer = <unknown>
host = windows
compile_size = 64bit

	code for sm_20

Fatbin elf code:
================
arch = sm_20
code version = [1,7]
producer = cuda
host = windows
compile_size = 64bit

	code for sm_20
		Function : _Z6VecAddPdS_S_
	.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
        /*0000*/         MOV R1, c[0x1][0x100];                /* 0x2800440400005de4 */
        /*0008*/         S2R R0, SR_CTAID.X;                   /* 0x2c00000094001c04 */
        /*0010*/         S2R R2, SR_TID.X;                     /* 0x2c00000084009c04 */
        /*0018*/         IMAD R0, R0, c[0x0][0x8], R2;         /* 0x2004400020001ca3 */
        /*0020*/         ISETP.GT.AND P0, PT, R0, 0x3e7, PT;   /* 0x1a0ec00f9c01dc23 */
        /*0028*/     @P0 EXIT;                                 /* 0x80000000000001e7 */
        /*0030*/         MOV32I R2, 0x8;                       /* 0x1800000020009de2 */
        /*0038*/         ISET.LT.AND R6, R0, RZ, PT;           /* 0x108e0000fc019c23 */
        /*0040*/         MOV R3, RZ;                           /* 0x28000000fc00dde4 */
        /*0048*/         IMAD R12.CC, R0, R2, c[0x0][0x28];    /* 0x20058000a0031ca3 */
        /*0050*/         MOV32I R7, 0xffffd8f0;                /* 0x1bffff63c001dde2 */
        /*0058*/         IMAD.HI.X R13, R0, R2, c[0x0][0x2c];  /* 0x20848000b0035ce3 */
        /*0060*/         IMAD R10.CC, R0, R2, c[0x0][0x20];    /* 0x2005800080029ca3 */
        /*0068*/         LD.E.64 R4, [R12];                    /* 0x8400000000c11ca5 */
        /*0070*/         IMAD.HI.X R11, R0, R2, c[0x0][0x24];  /* 0x208480009002dce3 */
        /*0078*/         MOV R2, RZ;                           /* 0x28000000fc009de4 */
        /*0080*/         LD.E.64 R8, [R10];                    /* 0x8400000000a21ca5 */
        /*0088*/         DADD R4, R8, R4;                      /* 0x4800000010811c01 */
        /*0090*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0098*/         IADD R7, R7, 0x32;                    /* 0x4800c000c871dc03 */
        /*00a0*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*00a8*/         ISETP.NE.AND P0, PT, R7, RZ, PT;      /* 0x1a8e0000fc71dc23 */
        /*00b0*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*00b8*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*00c0*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*00c8*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*00d0*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*00d8*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*00e0*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*00e8*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*00f0*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*00f8*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0100*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0108*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0110*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0118*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0120*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0128*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0130*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0138*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0140*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0148*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0150*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0158*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0160*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0168*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0170*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0178*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0180*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0188*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0190*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0198*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*01a0*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*01a8*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*01b0*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*01b8*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*01c0*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*01c8*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*01d0*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*01d8*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*01e0*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*01e8*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*01f0*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*01f8*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0200*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0208*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0210*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0218*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0220*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0228*/         DADD R2, R4, R2;                      /* 0x4800000008409c01 */
        /*0230*/     @P0 BRA 0x90;                             /* 0x4003fff9600001e7 */
        /*0238*/         SHR.U32 R4, R0, 0x1d;                 /* 0x5800c00074011c03 */
        /*0240*/         MOV R7, c[0x0][0x30];                 /* 0x28004000c001dde4 */
        /*0248*/         ISCADD R5, R6, R4, 0x3;               /* 0x4000000010615c63 */
        /*0250*/         IMAD.U32.U32 R4.CC, R0, 0x8, R7;      /* 0x200fc00020011c03 */
        /*0258*/         IADD.X R5, R5, c[0x0][0x34];          /* 0x48004000d0515c43 */
        /*0260*/         ST.E.64 [R4], R2;                     /* 0x9400000000409ca5 */
        /*0268*/         EXIT;                                 /* 0x8000000000001de7 */
		................................

However, I wonder why it still shows arch = sm_20? Shouldn’t this be arch = sm_61?
Also, the float version is still only 9 to 10 times faster than the double version.
Then I looked at the visual profiler; it shows that for each cudaLaunch call the float version is 11 to 12 times faster than the double version. Maybe, as njuffa said, the float/double performance ratio rarely comes close to the theoretical ratio of the float/double instruction throughputs.

Thank you very much for your reply! I have learned a lot from it. As txbob suggested, I used the visual profiler, which shows that for each cudaLaunch call the float version is 11 to 12 times faster than the double version. As you said, it may be a good thing :)

If you want to build for sm_61, you need to tell the compiler, as it defaults to the lowest supported architecture, which is sm_20. For example, build with

nvcc -arch=sm_61 -o [my_executable] [my_source]

Hi njuffa, thank you for your reply :) I have tried -arch=sm_61, which does not change the speed of either version. What makes me wonder is that txbob said “when run on a cc 6.1 GPU, the cc 2.0 PTX will be forward JIT-compiled to create cc 6.1 SASS code”. So I expected the SASS code I got from the cc 2.0 PTX to show “arch = sm_61”; however, it still shows “arch = sm_20”.

If you build with

nvcc -arch=sm_61 -o [my_executable] [my_source]

and then run

cuobjdump --dump-sass [my_executable]

you will see SASS for sm_61. If not, you are probably not building what you think you are building, or looking at the wrong files.

If you use default compilation, you will get SASS and PTX for sm_20, and the CUDA driver will JIT compile the sm_20 PTX to sm_61 SASS when you are running with an sm_61 GPU, but you never get to see the SASS. The SASS from the JIT compiler will go into the JIT cache, from which you could dump the code, but how to find and extract code from the JIT cache would really get the conversation here off into the weeds.
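(If the goal is to avoid relying on the JIT path altogether, one common approach, shown here only as an example of standard nvcc usage, is to embed both sm_61 SASS and compute_61 PTX in the same binary:)

nvcc -gencode arch=compute_61,code=sm_61 -gencode arch=compute_61,code=compute_61 -o [my_executable] [my_source]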

Hi njuffa, thank you very much for your reply! Now it is very clear :)