I (still) have a problem where (random) global memory lookups in buffers > 1GB have a severe performance impact on the achieved bandwidth. The program below creates a 1GB buffer and fills it with random numbers. The kernel then loads 128 * 128 coalesced bytes from random locations per thread. It is a simplified version of a cryptocurrency hashing algorithm (Ethereum).
When the buffer is exactly 1GB, the bandwidth is about 70GB/s, close to peak bandwidth on a GTX750Ti. But when it is increased to e.g. 1152MB, 1280MB or 1536MB, the bandwidth drops drastically (exponentially, I think). On a GTX780, I get about 135GB/s regardless of the buffer size.
I have strong reasons to believe that this doesn’t happen on GTX750Ti/Linux (field reports from my Ethereum miner that has the same behaviour), but I’ll have to double-check that. If anyone wants to try this out on Linux, that would save me a lot of hassle ;).
What causes this, is it a bug?
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Launch configuration: 8192 blocks x 256 threads = 2,097,152 threads total.
#define GRID_SIZE 8192
#define BLOCK_SIZE 256
#define BUFFER_SIZE (1024 * 1024 * 1024) // <---- change buffer size here!
// Each 128-byte hash entry is read cooperatively by 8 threads (one uint4 each).
#define THREADS_PER_HASH 8
// Dependent random lookups performed per pass, per thread.
#define ITERATIONS 16
// 32-bit FNV prime; fnv(x,y) is one FNV-1 style mixing step.
#define FNV_PRIME 0x01000193
#define fnv(x,y) ((x) * FNV_PRIME ^(y))
// rand() alone is too small a range on Windows, so two calls are multiplied.
#define random() (rand() * rand()) // <---- RAND_MAX on Win is 32767
// Evaluate a CUDA runtime call, and on failure print the enclosing function,
// line number and error string, then exit. The do/while(0) wrapper makes the
// macro behave like a single statement (safe inside unbraced if/else).
#define CUDA_SAFE_CALL(call) \
do { \
cudaError_t err = call; \
if (cudaSuccess != err) { \
fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \
__FUNCTION__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
// One 128-byte hash entry, viewable either as 32 uint32s or as 8 uint4s so a
// group of THREADS_PER_HASH (8) threads can each load one 16-byte uint4,
// together covering the entry with a single fully coalesced 128-byte access.
typedef union
{
uint32_t uint32s[128 / sizeof(uint32_t)];
uint4 uint4s[128 / sizeof(uint4)];
} hash128_t;
// Device buffer pointer and its length in hash128_t elements, published to
// device code via constant memory (set with cudaMemcpyToSymbol in main).
__constant__ hash128_t * d_buffer;
__constant__ unsigned int d_buffer_size;
// Collapse a uint4 into a single 32-bit word by chaining FNV mixing steps
// across the four components, left to right.
__device__ uint32_t fnv_reduce(uint4 v)
{
    uint32_t h = fnv(v.x, v.y);
    h = fnv(h, v.z);
    h = fnv(h, v.w);
    return h;
}
// Bandwidth stress kernel. Threads work in groups of THREADS_PER_HASH (8)
// lanes; on each of ITERATIONS steps a group broadcasts one lane's state,
// uses it as a pseudo-random index into d_buffer, and all 8 lanes load
// consecutive uint4s of that 128-byte entry (one coalesced 128-byte access).
// The dependent fnv_reduce chain prevents the loads from being elided.
//
// search      - value compared against each lane's final state; matches are
//               counted in *num_results so the compiler cannot dead-code
//               eliminate the memory traffic.
// num_results - pinned host counter (zero-copy), bumped atomically on a match.
//
// Requires CUDA 9+ for __shfl_sync (the legacy mask-less __shfl used
// previously is deprecated and removed on Volta+).
__global__ void test(int search, volatile unsigned int * num_results)
{
	const unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
	// Lane index within the 8-thread group, and the group's first lane.
	const int thread_id = threadIdx.x & (THREADS_PER_HASH - 1);
	const int start_lane = threadIdx.x & ~(THREADS_PER_HASH - 1);
	unsigned int s = gid;
	unsigned int r;
	for (int i = 0; i < THREADS_PER_HASH; i++) {
		for (int j = 0; j < ITERATIONS; j++) {
			// Broadcast lane (start_lane + i)'s state to the whole group.
			// Full-warp mask is safe: no lane exits before this point.
			unsigned int index = __shfl_sync(0xFFFFFFFFu, s, start_lane + i);
			uint4 v = d_buffer[index % d_buffer_size].uint4s[thread_id];
			s = fnv_reduce(v);
		}
		s = __shfl_sync(0xFFFFFFFFu, s, start_lane + i);
		// Each lane keeps the result of the pass it "owns"; i == thread_id
		// holds exactly once per lane, so r is always initialized.
		if (i == thread_id) {
			r = s;
		}
	}
	if (search == r) {
		// atomicInc takes a non-volatile pointer; const_cast strips volatile.
		atomicInc(const_cast<unsigned int *>(num_results), UINT_MAX);
		// Make the pinned-memory update visible to the host.
		__threadfence_system();
	}
}
// Fills a BUFFER_SIZE device buffer with random words, launches the test
// kernel `count` times and reports the achieved global-memory bandwidth.
// Returns the number of (extremely unlikely) search matches.
int main()
{
	// Host staging copy of the buffer; 1 GB, so check the allocation.
	unsigned int * buffer = (unsigned int *)malloc(BUFFER_SIZE);
	if (buffer == NULL) {
		fprintf(stderr, "Failed to allocate %u bytes of host memory.\n", BUFFER_SIZE);
		return EXIT_FAILURE;
	}
	printf("Creating buffer of size %u bytes...\n", BUFFER_SIZE);
	srand(time(NULL));
	for (unsigned int i = 0; i < BUFFER_SIZE / 4; i++) {
		buffer[i] = random();
	}
	hash128_t * h_buffer;                // device buffer (despite the h_ prefix)
	volatile unsigned int * num_results; // pinned host counter written by the kernel
	unsigned int h_buffer_size = BUFFER_SIZE / sizeof(hash128_t);
	CUDA_SAFE_CALL(cudaSetDevice(0));
	CUDA_SAFE_CALL(cudaMallocHost((void**)&num_results, sizeof(unsigned int)));
	CUDA_SAFE_CALL(cudaMalloc((void**)&h_buffer, BUFFER_SIZE));
	CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(h_buffer), buffer, BUFFER_SIZE, cudaMemcpyHostToDevice));
	free(buffer); // host copy no longer needed once the data is on the device
	// Publish the device pointer and element count through constant memory.
	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_buffer, &h_buffer, sizeof(hash128_t *)));
	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_buffer_size, &h_buffer_size, sizeof(unsigned int)));
	num_results[0] = 0;
	unsigned int target = random();
	cudaEvent_t start, stop;
	CUDA_SAFE_CALL(cudaEventCreate(&start));
	CUDA_SAFE_CALL(cudaEventCreate(&stop));
	CUDA_SAFE_CALL(cudaEventRecord(start, nullptr));
	int count = 16;
	for (int i = 0; i < count; i++) {
		test << <GRID_SIZE, BLOCK_SIZE >> >(target, num_results);
		CUDA_SAFE_CALL(cudaGetLastError());     // catch launch-config errors
		CUDA_SAFE_CALL(cudaDeviceSynchronize()); // catch async execution errors
	}
	CUDA_SAFE_CALL(cudaEventRecord(stop, nullptr));
	// cudaEventRecord is asynchronous: wait for the event to complete before
	// querying it, otherwise cudaEventElapsedTime can fail with
	// cudaErrorNotReady.
	CUDA_SAFE_CALL(cudaEventSynchronize(stop));
	float duration;
	CUDA_SAFE_CALL(cudaEventElapsedTime(&duration, start, stop));
	// Bytes per launch: 16 B (uint4) * 8 loads/iter * 16 iters per thread,
	// times GRID_SIZE * BLOCK_SIZE threads; scaled by count launches and
	// converted from ms to GB/s.
	printf("%f GB/s\n", (1000.0f / duration) * count * sizeof(uint4) * THREADS_PER_HASH * ITERATIONS * GRID_SIZE * BLOCK_SIZE / static_cast<float>(1 << 30));
	unsigned int results = num_results[0];
	// Release GPU resources before exiting.
	CUDA_SAFE_CALL(cudaEventDestroy(start));
	CUDA_SAFE_CALL(cudaEventDestroy(stop));
	CUDA_SAFE_CALL(cudaFree(h_buffer));
	CUDA_SAFE_CALL(cudaFreeHost(const_cast<unsigned int *>(num_results)));
	return results;
}