Under the following configuration:
HW: Tesla K20
OS: Fedora 21 - 3.18.7-200.fc21.x86_64 kernel
Nvidia driver: 346.35
Cuda: 6.5
We did notice that basically any blocking synchronization spins the CPU as if it was implemented internally in your driver with a busy wait like construct. We could observe this with both cudaStreamSynchronize() and cudaEventSynchronize(). Of course it renders the particular system useless for compute since it increases the CPU load to an unbearable level. Under Windows, we don’t see the same problem.
Here you are with a few lines of dummy code which can reproduce the issue:
#include <stdio.h>
#include <stdint.h>
#include <signal.h>
#include <time.h>
#include <unistd.h>
#include <cuda_runtime.h>
#define BUFFER_SIZE (1u << 27) /* 100 MiB */
#define CUDA_CHECK(x) \
do \
{ \
if (x != cudaSuccess) \
return -1; \
} while(0)
static bool stop = false;
static void handler(int signal)
{
stop = true;
}
static double now()
{
struct timespec time = {0};
clock_gettime(CLOCK_MONOTONIC_RAW, &time);
return (time.tv_sec * 1000000000 + time.tv_nsec) / 1000000000.0;
}
int main(int argc, char **argv)
{
/* Handle SIGINT */
{
printf("Handle SIGINT signals");
struct sigaction act = {0};
act.sa_handler = handler;
if (sigaction(SIGINT, &act, NULL))
return -1;
}
/* Initialise CUDA */
{
int numDevices;
CUDA_CHECK(cudaGetDeviceCount(&numDevices));
printf("Found %d CUDA capable devices\n", numDevices);
if (numDevices < 1)
{
printf("No devices found, exiting.");
return -1;
}
CUDA_CHECK(cudaSetDevice(0));
}
/* Make a stream and allocate some buffers */
cudaStream_t stream;
uint8_t *inputHost1, *inputHost2, *outputHost;
uint8_t *inputDevice1, *inputDevice2, *outputDevice;
{
printf("Creating stream\n");
CUDA_CHECK(cudaStreamCreate(&stream));
printf("Allocating host buffers\n");
CUDA_CHECK(cudaMallocHost(&inputHost1, BUFFER_SIZE));
CUDA_CHECK(cudaMallocHost(&inputHost2, BUFFER_SIZE));
CUDA_CHECK(cudaMallocHost(&outputHost, BUFFER_SIZE));
printf("Allocating device buffers\n");
CUDA_CHECK(cudaMalloc(&inputDevice1, BUFFER_SIZE));
CUDA_CHECK(cudaMalloc(&inputDevice2, BUFFER_SIZE));
CUDA_CHECK(cudaMalloc(&outputDevice, BUFFER_SIZE));
}
/* Initialise input data */
printf("Initialising data\n");
for (size_t index = 0; index < BUFFER_SIZE; index++)
{
/* I know, I know, this is garbage */
inputHost1[index] = index % 10;
inputHost2[index] = ((index + 2) >> 1) % 10;
}
/* Work loop */
double memcpyTime = 0.0;
double syncTime = 0.0;
double start = now();
double previous = start;
double current;
uint64_t iterations = 0;
printf("Starting work\n");
while (stop == false)
{
/* Upload data */
CUDA_CHECK(cudaMemcpyAsync(
inputDevice1, inputHost1, BUFFER_SIZE,
cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaMemcpyAsync(
inputDevice2, inputHost2, BUFFER_SIZE,
cudaMemcpyDeviceToHost, stream));
/* Launch kernel */
/* Download data */
CUDA_CHECK(cudaMemcpyAsync(
outputHost, outputDevice, BUFFER_SIZE,
cudaMemcpyHostToDevice, stream));
current = now();
memcpyTime += current - previous;
previous = current;
/* the spinning on the CPU coccurs here */
CUDA_CHECK(cudaStreamSynchronize(stream));
/* this is a viable work-around for the problem, but we'd prefer to see a driver-fix */
//while (cudaStreamQuery(stream) != cudaSuccess)
// usleep(100);
current = now();
syncTime += current - previous;
previous = current;
iterations++;
}
double end = now();
/* Print timing */
printf("Spent %lf seconds in cudaMemcpyAsync()\n", memcpyTime);
printf("Spent %lf seconds in cudaStreamSynchronize()\n", syncTime);
printf("Ran at a speed of %lf iterations/second\n", iterations/(end - start));
/* Verify output data */
printf("Veryfying data\n");
for (size_t index = 0; index < BUFFER_SIZE; index++)
{
if (outputHost[index] != inputHost1[index] + inputHost2[index])
{
printf("INVALID RESULT at index %llu!\n", index);
break;
}
}
/* Cleanup stream and buffers */
{
printf("Freeing device buffers\n");
CUDA_CHECK(cudaFree(inputDevice1));
CUDA_CHECK(cudaFree(inputDevice2));
CUDA_CHECK(cudaFree(outputDevice));
printf("Freeing host buffers\n");
CUDA_CHECK(cudaFreeHost(inputHost1));
CUDA_CHECK(cudaFreeHost(inputHost2));
CUDA_CHECK(cudaFreeHost(outputHost));
printf("Destroying stream\n");
CUDA_CHECK(cudaStreamDestroy(stream));
}
return 0;
}
Which driver version would you recommend us to use, and what is your ETA to reduce your driver’s CPU load to a reasonable level - in case it has not been fixed already?
You can find a viable workaround as a comment in the code as well.
Thank you very much in advance for your co-operation.