My last post got me wondering whether the L2 cache is any benefit to writes that cannot be coalesced at the warp level, but could be approximately coalesced at a higher level. To test the idea, I wrote a little kernel that has each thread (16M total) write a value to output index threadIdx.x * gridDim.x + blockIdx.x. This takes 16M memory transactions to process, but it also offers lots of opportunities for combining writes from different blocks, if you get lucky and consecutive blocks write to the same cache line at the same time. Results:
Device name: GeForce GTX 275
BogoGFLOPS: 699.8
Size of array: 16776960 elements
No coalesce: 21.145 ms, 3026.6 MB/sec
(Edit: fixed the device name and BogoGFLOPS values, which I had retyped incorrectly; aviday spotted the mistake below.)
Device name: GeForce GTX 470
BogoGFLOPS: 1088.6
Size of array: 16776960 elements
No coalesce: 6.492 ms, 9857.6 MB/sec
Not bad. A 3.25x improvement between devices with approximately the same theoretical memory bandwidth. Looks like writes can be combined in the L2 cache.
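To make the access pattern concrete, here is a toy illustration (mine, not part of the benchmark) using a hypothetical 4-block, 4-thread launch. Within a block, consecutive threads write addresses gridDim.x apart, which breaks warp-level coalescing; but for a fixed threadIdx.x, consecutive blocks write adjacent addresses, which is exactly what the L2 can combine:

#include <stdio.h>

int main(void)
{
    /* Tiny stand-in for the real launch: 4 blocks of 4 threads. */
    const int gridDim_x = 4, blockDim_x = 4;
    for (int threadIdx_x = 0; threadIdx_x < blockDim_x; threadIdx_x++)
        for (int blockIdx_x = 0; blockIdx_x < gridDim_x; blockIdx_x++)
            /* Same formula as the kernel: transposed write index. */
            printf("block %d, thread %d -> output[%d]\n",
                   blockIdx_x, threadIdx_x,
                   threadIdx_x * gridDim_x + blockIdx_x);
    return 0;
}

Thread 0 of blocks 0..3 writes output[0..3], thread 1 writes output[4..7], and so on: blocks that happen to execute at roughly the same time land in the same cache line.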
I changed the original code a bit to do a simple bandwidth test - I hope that's OK with you, seibert :). The results are somewhat interesting and disappointing at the same time :) (at least for me)…
/* -*- c++ -*- */
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

/* Yes, I'm going to hell. */
#define CUDA_CHECK_ERROR(call) do {                                       \
        cudaError err = call;                                             \
        if (cudaSuccess != err) {                                         \
            fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err));         \
            exit(EXIT_FAILURE);                                           \
        } } while (0)

///////////////// Kernels ///////////////////////

#define THREADS_PER_BLOCK 256

__global__ void no_coalesce(float *output)
{
    unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
    // Transpose the block and thread ID to break coalescing maximally
    unsigned int write_index = threadIdx.x * gridDim.x + blockIdx.x;
    output[write_index] = index * 1.1f;
}

__global__ void bw_test_straight(float *input, float *output)
{
    unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
    output[index] = input[index] * 1.1f;
}

__global__ void bw_test_transpose(float *input, float *output)
{
    unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
    // Transpose the block and thread ID to break coalescing maximally
    unsigned int write_index = threadIdx.x * gridDim.x + blockIdx.x;
    output[write_index] = input[index] * 1.1f;
}

////////////////// Host code ///////////

int main()
{
    const unsigned int blocks = 65535;
    const unsigned int n = blocks * THREADS_PER_BLOCK;

    // Device information
    int device;
    CUDA_CHECK_ERROR(cudaGetDevice(&device));
    struct cudaDeviceProp prop;
    CUDA_CHECK_ERROR(cudaGetDeviceProperties(&prop, device));
    int proc_per_multiproc = 8;                    // 8 cores per SM on GT200
    if (prop.major == 2) proc_per_multiproc = 32;  // 32 on Fermi
    printf("Device name: %s\n", prop.name);

    // Bogus normalization metric
    float bogogflops = 2 * prop.clockRate * prop.multiProcessorCount * proc_per_multiproc / 1e6;
    printf("BogoGFLOPS: %1.1f\n\n", bogogflops);
    printf("Size of array: %d elements\n", n);

    // Allocate and initialize arrays
    float *d_output, *d_input;
    float *h_input = new float[n];
    for (unsigned int i = 0; i < n; i++) h_input[i] = i / 5.555f;
    CUDA_CHECK_ERROR(cudaMalloc((void **) &d_input, sizeof(float) * n));
    CUDA_CHECK_ERROR(cudaMemcpy(d_input, h_input, sizeof(float) * n, cudaMemcpyHostToDevice));
    CUDA_CHECK_ERROR(cudaMalloc((void **) &d_output, sizeof(float) * n));

    // Warmup
    no_coalesce<<<blocks, THREADS_PER_BLOCK>>>(d_output);
    bw_test_straight<<<blocks, THREADS_PER_BLOCK>>>(d_input, d_output);
    bw_test_transpose<<<blocks, THREADS_PER_BLOCK>>>(d_input, d_output);
    CUDA_CHECK_ERROR(cudaThreadSynchronize());

    cudaEvent_t start, stop, start1, stop1, start2, stop2;
    float elapsedTime, elapsedTime1, elapsedTime2;
    CUDA_CHECK_ERROR(cudaEventCreate(&start));
    CUDA_CHECK_ERROR(cudaEventCreate(&stop));
    CUDA_CHECK_ERROR(cudaEventCreate(&start1));
    CUDA_CHECK_ERROR(cudaEventCreate(&stop1));
    CUDA_CHECK_ERROR(cudaEventCreate(&start2));
    CUDA_CHECK_ERROR(cudaEventCreate(&stop2));

    CUDA_CHECK_ERROR(cudaEventRecord(start, 0));
    no_coalesce<<<blocks, THREADS_PER_BLOCK>>>(d_output);
    CUDA_CHECK_ERROR(cudaEventRecord(stop, 0));
    CUDA_CHECK_ERROR(cudaEventSynchronize(stop));
    CUDA_CHECK_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
    // Compute effective memory bandwidth (counting only the n floats written)
    printf("No coalesce: %1.3f ms, %1.1f MB/sec\n", elapsedTime,
           ((float) n / elapsedTime * 1e3 / 1024.0 / 1024.0 * 4));

    CUDA_CHECK_ERROR(cudaEventRecord(start1, 0));
    bw_test_straight<<<blocks, THREADS_PER_BLOCK>>>(d_input, d_output);
    CUDA_CHECK_ERROR(cudaEventRecord(stop1, 0));
    CUDA_CHECK_ERROR(cudaEventSynchronize(stop1));
    CUDA_CHECK_ERROR(cudaEventElapsedTime(&elapsedTime1, start1, stop1));
    printf("bw_test_straight: %1.3f ms, %1.1f MB/sec\n", elapsedTime1,
           ((float) n / elapsedTime1 * 1e3 / 1024.0 / 1024.0 * 4));

    CUDA_CHECK_ERROR(cudaEventRecord(start2, 0));
    bw_test_transpose<<<blocks, THREADS_PER_BLOCK>>>(d_input, d_output);
    CUDA_CHECK_ERROR(cudaEventRecord(stop2, 0));
    CUDA_CHECK_ERROR(cudaEventSynchronize(stop2));
    CUDA_CHECK_ERROR(cudaEventElapsedTime(&elapsedTime2, start2, stop2));
    printf("bw_test_transpose: %1.3f ms, %1.1f MB/sec\n", elapsedTime2,
           ((float) n / elapsedTime2 * 1e3 / 1024.0 / 1024.0 * 4));

    delete[] h_input;
    cudaFree(d_input);
    cudaFree(d_output);
    return 0;
}
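For anyone who wants to reproduce this: the listing above is a complete program. Something along these lines should build and run it (the file name and flags are my guess, not from the original post; use -arch=sm_20 for the Fermi cards and omit it for the GT200 boards):

nvcc -O2 -arch=sm_20 -o bw_test bw_test.cu
./bw_test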
These are the results for a GTX 480 on Linux:
Device name: GeForce GTX 480
BogoGFLOPS: 1345.0
Size of array: 16776960 elements
No coalesce: 5.585 ms, 11458.5 MB/sec
bw_test_straight: 0.947 ms, 67571.1 MB/sec
bw_test_transpose: 5.386 ms, 11882.8 MB/sec
And these for a GTX 295 (one half of it) on Linux:
Device name: GeForce GTX 295
BogoGFLOPS: 596.2
Size of array: 16776960 elements
No coalesce: 16.828 ms, 3803.1 MB/sec
bw_test_straight: 1.378 ms, 46437.5 MB/sec
bw_test_transpose: 19.622 ms, 3261.5 MB/sec
So it seems that the original “No coalesce” test shows the same performance gain that seibert saw.
The “bw_test_transpose” test also shows a big boost in favour of the GTX 480 (about the same as “No coalesce”).
However, “bw_test_straight” shows only a ~45% boost on the GTX 480 (0.947 ms vs 1.378 ms). This is consistent with the boost in my real application and is discussed in the thread I mentioned above.
So for me, and maybe for all conventional bandwidth-bound kernels, this is about the biggest boost I can expect from Fermi.
Yes, a bandwidth-bound kernel that already reaches near-peak memory bandwidth is not going to see an enormous boost from Fermi. In fact, the ratio of compute to memory bandwidth in the Fermi cards seems to be tipped further toward compute than in previous cards.
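To put a rough number on that ratio, divide the BogoGFLOPS figures above by the boards' theoretical DRAM bandwidths (roughly 127 GB/s for the GTX 275 and 177 GB/s for the GTX 480; those bandwidth figures are mine, not from the posts above):

GTX 275: 699.8 / 127 ≈ 5.5 flops per byte
GTX 480: 1345.0 / 177 ≈ 7.6 flops per byte

So Fermi offers roughly 40% more arithmetic per byte of DRAM traffic, which is another way of saying that purely bandwidth-bound kernels see the smallest share of the improvement.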
__global__ void bw_test_straight_with_loop(float *input, float *output)
{
    unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
    float a = 0.0f;
    // Each thread sums 1000 consecutive floats starting at its own index,
    // so neighbouring threads re-read almost all of the same data.
    // Note: the last 999 threads read past the end of the array unless
    // the input buffer is padded by an extra 999 floats.
    for (int i = 0; i < 1000; i++)
    {
        a += input[index + i];
    }
    output[index] = a * 1.1f;
}
GTX295:
Device name: GeForce GTX 295
Size of array: 16776960 elements
bw_test_straight_with_loop: 1035.089 ms, 61.8 MB/sec
GTX480:
Device name: GeForce GTX 480
Size of array: 16776960 elements
bw_test_straight_with_loop: 269.733 ms, 237.3 MB/sec
Nice :)
I’ve tested it with #pragma unroll on the loop, and these are the results… I wonder which Fermi feature is responsible for this?… :)
Device name: GeForce GTX 295
Size of array: 16776960 elements
bw_test_straight_with_loop: 1037.054 ms, 61.7 MB/sec
vs
Device name: GeForce GTX 480
Size of array: 16776960 elements
bw_test_straight_with_loop: 174.261 ms, 367.3 MB/sec
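For what it's worth, a back-of-the-envelope check (my arithmetic, assuming the loop really issues 1000 loads per thread): the printed MB/sec figures only count the n output floats, so they understate the true traffic by a factor of ~1000. The kernel actually reads about 16776960 × 1000 × 4 bytes ≈ 67 GB per launch. That works out to ≈ 65 GB/s on the GTX 295 half, comfortably under its ~112 GB/s DRAM peak, but ≈ 249 GB/s without unrolling and ≈ 385 GB/s with it on the GTX 480, far above its ~177 GB/s DRAM peak. So the Fermi feature responsible is almost certainly the L1/L2 cache hierarchy: neighbouring threads keep re-reading the same cache lines, and most of those hits never touch DRAM.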