Low performance for convolution in cuDNN on Tesla V100
I am testing a Tesla V100 with CUDA 9 and cuDNN 7 (on Windows 10).

With cuBLAS I measured good performance, ~90 Tflops on matrix multiplication. With cuDNN, however, I measured only low performance and saw no advantage from the tensor cores on the V100.
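For reference, the cuBLAS number came from an FP16 GEMM with FP32 accumulation, roughly along these lines (a simplified sketch, not my exact benchmark code; m, n, k and the d_A/d_B/d_C device buffers are placeholders, and the tensor-op algo enum spelling can differ between CUDA 9 point releases):

// Sketch only: tensor-core GEMM in cuBLAS (FP16 inputs, FP32 accumulation).
// Requires #include <cublas_v2.h>.
// d_A, d_B, d_C are hypothetical __half device buffers, column-major, of sizes m*k, k*n, m*n.
cublasHandle_t blas;
cublasCreate(&blas);
cublasSetMathType(blas, CUBLAS_TENSOR_OP_MATH); // allow tensor cores for this handle
const float one = 1.0f, zero = 0.0f;
cublasGemmEx(blas, CUBLAS_OP_N, CUBLAS_OP_N,
    m, n, k,
    &one,
    d_A, CUDA_R_16F, m,
    d_B, CUDA_R_16F, k,
    &zero,
    d_C, CUDA_R_16F, m,
    CUDA_R_32F,                   // accumulate in FP32
    CUBLAS_GEMM_DFALT_TENSOR_OP); // tensor-op GEMM algorithm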

For example, the following code reaches only ~14 Tflops. That is unacceptable considering NVIDIA's marketing promises and the price of the V100.

Are there parameter settings with which the Tesla V100 benefits from tensor cores in cuDNN when computing convolutions?

//--------------------------------------------------------
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cudnn.h>

// second() is a host wall-clock timer returning seconds (helper definition not shown).
double second();

void Check(cudnnStatus_t status)
{
if (status != CUDNN_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! GPU program execution error: %s\n", cudnnGetErrorString(status));
exit(EXIT_FAILURE);
}
}

void Check(cudaError_t status)
{
if (status != cudaSuccess)
{
fprintf(stderr, "!!!! CUDA error: %s\n", cudaGetErrorString(status));
exit(EXIT_FAILURE);
}
}

size_t Get_size(cudnnTensorDescriptor_t tensor_desc)
{
size_t size = 0;
Check(cudnnGetTensorSizeInBytes(tensor_desc, &size));
return size;
}

void* Alloc(size_t size)
{
if (size == 0) return 0;

void* x = 0;
Check(cudaMalloc(&x, size));
return x;
}

int main()
{
cudnnHandle_t cudnn;
cudnnCreate(&cudnn);

const int c_in = 128;
const int c_out = 128;

const int xh = 128;
const int xw = 128;
const int n = 128;

const int fh = 4;
const int fw = 4;

float alpha = 0.1f, beta = 0.1f;

cudnnTensorFormat_t data_format = CUDNN_TENSOR_NCHW;

cudnnDataType_t data_type = CUDNN_DATA_HALF;

cudnnTensorDescriptor_t x_desc;
Check(cudnnCreateTensorDescriptor(&x_desc));
Check(cudnnSetTensor4dDescriptor(x_desc, data_format, data_type, n, c_in, xh, xw));

cudnnFilterDescriptor_t f_desc;
Check(cudnnCreateFilterDescriptor(&f_desc));
Check(cudnnSetFilter4dDescriptor(f_desc, data_type, data_format, c_out, c_in, fh, fw));

cudnnConvolutionDescriptor_t conv_desc;
Check(cudnnCreateConvolutionDescriptor(&conv_desc));

Check(cudnnSetConvolution2dDescriptor(
conv_desc,
0,                  // pad_h
0,                  // pad_w
1,                  // stride u
1,                  // stride v
1,                  // dilation_h
1,                  // dilation_w
CUDNN_CONVOLUTION,
CUDNN_DATA_FLOAT)); // compute type

int n_, c_, h_, w_;
Check(cudnnGetConvolution2dForwardOutputDim(conv_desc, x_desc, f_desc, &n_, &c_, &h_, &w_));

// 2 flops (multiply + add) per MAC, per output element.
float Gflop = (2.0f * (float)n_ * (float)h_ * (float)w_ * (float)c_out * (float)c_in * (float)fh * (float)fw) / (1000.0f * 1000.0f * 1000.0f);
printf("n_=%d c_=%d h_=%d w_=%d, Gflop=%f\n", n_, c_, h_, w_, Gflop);

cudnnTensorDescriptor_t y_desc;
Check(cudnnCreateTensorDescriptor(&y_desc));
Check(cudnnSetTensor4dDescriptor(y_desc, data_format, data_type, n_, c_, h_, w_));

cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; // an algorithm that can use tensor ops

size_t tmp_size = 0;
Check(cudnnGetConvolutionForwardWorkspaceSize(cudnn, x_desc, f_desc, conv_desc, y_desc, algo, &tmp_size));

// Filter buffer size in bytes (CUDNN_DATA_HALF elements are 2 bytes each).
size_t f_size = (size_t)c_out * (size_t)c_in * (size_t)fh * (size_t)fw * 2;

printf("tmp_size: %f MB\n", tmp_size * 1.0f / (1024.0*1024.0));
printf("x_size: %f MB\n", Get_size(x_desc) * 1.0f / (1024.0*1024.0));
printf("y_size: %f MB\n", Get_size(y_desc) * 1.0f / (1024.0*1024.0));
printf("f_size: %f MB\n", f_size * 1.0f / (1024.0*1024.0));

void* x = Alloc(Get_size(x_desc));
void* f = Alloc(f_size);
void* y = Alloc(Get_size(y_desc));
void* t = Alloc(tmp_size);
void* y2 = Alloc(Get_size(y_desc)); // extra output buffer, unused in this benchmark

printf("x = %p, f = %p, y = %p, t = %p\n", x, f, y, t);

const int Q = 4;
cudaStream_t stream[Q];
for (int q = 0; q < Q; q++) Check(cudaStreamCreate(&(stream[q])));

double ti = second(); // start wall-clock timing

int U = 10;
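// Launch U x Q convolutions, round-robin over the Q streams. All calls share the same
// x, f, y and workspace buffers, which is fine for a throughput benchmark but means the
// contents of y are not meaningful.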
for (int u = 0; u < U; u++)
{
for (int q = 0; q < Q; q++)
{
Check(cudnnSetStream(cudnn, stream[q]));
Check(cudnnConvolutionForward(cudnn, &alpha, x_desc, x, f_desc, f,
conv_desc, algo, t, tmp_size, &beta, y_desc, y));
}
}

for (int q = 0; q < Q; q++) Check(cudaStreamSynchronize(stream[q]));
float secs = (float)(second() - ti);

float gflops = (Gflop * U * Q) / secs;
printf("secs: %f, Tflops: %f\n", secs, gflops / 1000.0f);
return 0;
}

#1
Posted 12/06/2017 06:32 PM   
Later, I found that the low performance was because I forgot to enable tensor cores. You need to call cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH) explicitly to allow tensor cores to be used. By default, cuDNN falls back to the legacy FP32 math path.

After calling cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH), I measured ~90 Tflops for convolutions.

Taking into account that I used the PCIe version of the V100, with a tensor-core peak of 112 Tflops at boost clock and 102 Tflops at base clock, I think this is a very good result. So, good job from NVIDIA :)
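As a quick sanity check on the efficiency: 90 / 112 ≈ 0.80, so the convolution reaches roughly 80% of the boost-clock tensor-core peak (about 88% of the 102 Tflops base-clock figure).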

//------------------------------------------------------------------
...

Check(cudnnCreateConvolutionDescriptor(&conv_desc));

Check(cudnnSetConvolution2dDescriptor(
conv_desc,
0,                  // pad_h
0,                  // pad_w
1,                  // stride u
1,                  // stride v
1,                  // dilation_h
1,                  // dilation_w
CUDNN_CONVOLUTION,  // (or CUDNN_CROSS_CORRELATION)
CUDNN_DATA_FLOAT)); // compute type; the tensor descriptors stay CUDNN_DATA_HALF

Check(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));

int n_, c_, h_, w_;
Check(cudnnGetConvolution2dForwardOutputDim(conv_desc, x_desc, f_desc, &n_, &c_, &h_, &w_));

...
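If you want to confirm that the chosen algorithm actually runs on the tensor cores, cuDNN 7 reports a math type per algorithm. A small sketch, reusing the handle and descriptors from the code above (field names as in the cuDNN 7 headers):

// Sketch: benchmark the available forward algorithms and print which math type each used.
const int req = CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
int returned = 0;
cudnnConvolutionFwdAlgoPerf_t perf[CUDNN_CONVOLUTION_FWD_ALGO_COUNT];
Check(cudnnFindConvolutionForwardAlgorithm(cudnn, x_desc, f_desc, conv_desc, y_desc,
    req, &returned, perf));
for (int i = 0; i < returned; i++)
{
    printf("algo %d: %.3f ms, workspace %zu bytes, mathType %s\n",
        (int)perf[i].algo, perf[i].time, perf[i].memory,
        perf[i].mathType == CUDNN_TENSOR_OP_MATH ? "TENSOR_OP" : "DEFAULT");
}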

#2
Posted 12/07/2017 12:35 PM   