The Programming Guide says that __syncthreads throughput is “16 operations per clock cycle for devices of compute capability 2.x” (section 5.4.3).
Is that per SM? I’m getting at most 8 operations per clock cycle on a Fermi GT-430. This is the code:
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * CUDA_CHECK_ERROR(call)
 *
 * Evaluates `call` exactly once and compares its result against cudaSuccess.
 * On any other status, prints the source file, the line number and the
 * cudaGetErrorString() text to stderr, then terminates the process with
 * EXIT_FAILURE. Wrapped in do { ... } while (0) so that it behaves as a
 * single statement when followed by a semicolon.
 */
#define CUDA_CHECK_ERROR(call)                                              \
    do {                                                                    \
        cudaError status_ = (call);                                         \
        if (status_ != cudaSuccess) {                                       \
            fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",   \
                    __FILE__, __LINE__, cudaGetErrorString(status_));       \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)
// Per-block timestamps published by thread 0 of the block:
// cl[0] = clock64() reading taken just before the timed barriers,
// cl[1] = clock64() reading taken just after them.
__shared__ long long int cl[2];

/*
 * Microbenchmark kernel: measures how many clock64() ticks nine back-to-back
 * __syncthreads() barriers take for the launching block.
 *
 * Only thread 0 takes and publishes the timestamps. Having every thread store
 * its own clock64() value into cl[0] is an unordered write race on a single
 * shared-memory word, so the recorded start time would belong to an arbitrary
 * thread and the measured interval would not be well defined.
 *
 * An extra, untimed barrier is issued first so that all warps of the block
 * have started and are aligned before the start timestamp is taken; otherwise
 * the first timed barrier would also absorb warp start-up skew.
 *
 * Thread 0 of each block prints the start and end readings, one per line,
 * via device-side printf.
 */
__global__ void no_op()
{
    long long int start = 0;

    // Untimed alignment barrier (not part of the measurement).
    __syncthreads();

    if (threadIdx.x == 0)
        start = clock64();

    // Timed region: exactly nine barriers, written out explicitly so that no
    // loop-control instructions end up inside the measured interval.
    __syncthreads();
    __syncthreads();
    __syncthreads();
    __syncthreads();
    __syncthreads();
    __syncthreads();
    __syncthreads();
    __syncthreads();
    __syncthreads();

    if (threadIdx.x == 0) {
        // Read the clock before doing anything else so the shared-memory
        // stores below are not counted.
        const long long int stop = clock64();
        cl[0] = start;
        cl[1] = stop;
        for (int i = 0; i < 2; i++)
            printf("%lld\n", cl[i]);
    }
}
/*
 * Launches the no_op barrier microbenchmark on a single block of 1024 threads
 * and waits for it to finish.
 *
 * Returns 0 on success. Any CUDA error reported by the launch or by the
 * synchronization is handled by CUDA_CHECK_ERROR, which prints a message and
 * exits with EXIT_FAILURE.
 */
int main()
{
    const int numBlocks = 1;
    const int threadsPerBlock = 32 * 32;  // 1024 threads in the block

    no_op<<<numBlocks, threadsPerBlock>>>();

    // A kernel launch returns no status itself; launch-configuration errors
    // are picked up here.
    CUDA_CHECK_ERROR(cudaGetLastError());

    // Block until the kernel has finished so that execution errors are
    // reported and the device-side printf output is flushed.
    // cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
    CUDA_CHECK_ERROR(cudaDeviceSynchronize());
    return 0;
}
It prints:
15667992
15669252
That is 1260 clock cycles for 9 × 1024 = 9216 __syncthreads operations, i.e. roughly 7–8 per cycle.
EDIT: with full occupancy it goes up to about 11/cycle.