[cuDNN bug] 3D convolution fails for large input image sizes (GPU memory is sufficient)

Hello,

I ran into a strange problem when using cuDNN's 3D convolutions. I believe it may be a bug in the cuDNN library; if it is, I hope it can be fixed soon.

Problem statement: I implemented a 3D convolution layer using cuDNN. Everything works as long as the input image is not too large. However, once the input image (and therefore the output feature map) grows beyond a certain size, I suddenly get an “an illegal memory access was encountered” error when calling cudaFree to release the allocated memory. Note that the total GPU memory requested is still smaller than the free GPU memory, so running out of GPU memory should not be the issue.

To reproduce the problem, I have included example code below. With an input tensor of size 1x1x576x416x282 the program runs successfully; however, if the tensor is made slightly larger, e.g. 1x1x576x416x320, the program reports the “illegal memory access” error. My guess is that the problem is related to 32-bit integer arithmetic somewhere inside the cuDNN library, because the error appears right around the point where the output feature map exceeds 4 GB.

Environment:
GeForce Titan Xp, CUDA 9.2, cuDNN 7.1.4, Ubuntu 16.04 LTS, GCC 5.4.0, NVIDIA driver 396.24.02.
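
For completeness, the library versions can also be confirmed programmatically with the standard query calls (a small stand-alone check, not part of the repro):

#include <cudnn.h>
#include <cuda_runtime.h>
#include <iostream>

int main()
{
    int runtime_version = 0, driver_version = 0;
    cudaRuntimeGetVersion(&runtime_version);  // e.g. 9020 for CUDA 9.2
    cudaDriverGetVersion(&driver_version);    // highest CUDA version the driver supports
    std::cout << "CUDA runtime: " << runtime_version
              << ", driver: " << driver_version
              << ", cuDNN: " << cudnnGetVersion() << std::endl;  // e.g. 7104 for cuDNN 7.1.4
    return 0;
}

Repro code: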

#include <cudnn.h>
#include <iostream>

// Fill 'stride' with the packed (fully contiguous) strides of a 5-D tensor, innermost dimension last.
void compute_stride(const int* size, int* stride) {
    for(int i = 4; i >= 0; i--)
        stride[i] = (i == 4) ? 1 : size[i+1] * stride[i+1];
}

int main(int argc, char** argv)
{
    cudnnHandle_t handle = nullptr;
    cudnnStatus_t status;
    status = cudnnCreate(&handle);
    if(status != CUDNN_STATUS_SUCCESS) {
        std::cerr << "fail to create cudnn handle: " << cudnnGetErrorString(status) << std::endl;
        return -1;
    }

    int in_tensor_size[] = {1, 1, 576, 416, 320};
//    int in_tensor_size[] = {1, 1, 576, 416, 282};
    int in_tensor_stride[5];
    compute_stride(in_tensor_size, in_tensor_stride);

    int conv_dilation[] = {1, 1, 1};
    int conv_pad[] = {1, 1, 1};
    int conv_stride[] = {1, 1, 1};

    int filt_size[] = {16, 1, 3, 3, 3};
    size_t filt_bytes = sizeof(float);  // total size of the filter tensor in bytes
    for(int i = 0; i < 5; ++i)
        filt_bytes *= filt_size[i];

    // input tensor desc
    cudnnTensorDescriptor_t in_tensor_desc;
    status = cudnnCreateTensorDescriptor(&in_tensor_desc);
    if(status != CUDNN_STATUS_SUCCESS) {
        std::cerr << "fail to create tensor desc: " << cudnnGetErrorString(status) << std::endl;
        return -1;
    }

    status = cudnnSetTensorNdDescriptor(in_tensor_desc, CUDNN_DATA_FLOAT, 5, in_tensor_size, in_tensor_stride);
    if(status != CUDNN_STATUS_SUCCESS) {
        std::cerr << "fail to set tensor desc: " << cudnnGetErrorString(status) << std::endl;
        return -1;
    }

    // filter desc
    cudnnFilterDescriptor_t filt_desc;
    status = cudnnCreateFilterDescriptor(&filt_desc);
    if(status != CUDNN_STATUS_SUCCESS) {
        std::cerr << "fail to create filt desc: " << cudnnGetErrorString(status) << std::endl;
        return -1;
    }
    status = cudnnSetFilterNdDescriptor(filt_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, filt_size);
    if(status != CUDNN_STATUS_SUCCESS) {
        std::cerr << "fail to set filt desc: " << cudnnGetErrorString(status) << std::endl;
        return -1;
    }

    // conv desc
    cudnnConvolutionDescriptor_t conv_desc;
    status = cudnnCreateConvolutionDescriptor(&conv_desc);
    if(status != CUDNN_STATUS_SUCCESS) {
        std::cerr << "fail to create conv desc: " << cudnnGetErrorString(status) << std::endl;
        return -1;
    }

    status = cudnnSetConvolutionNdDescriptor(conv_desc, 3, conv_pad, conv_stride, conv_dilation, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
    if(status != CUDNN_STATUS_SUCCESS) {
        std::cerr << "fail to set conv desc: " << cudnnGetErrorString(status) << std::endl;
        return -1;
    }

    // out tensor desc
    int out_tensor_size[5];
    status = cudnnGetConvolutionNdForwardOutputDim(conv_desc, in_tensor_desc, filt_desc, 5, out_tensor_size);
    if(status != CUDNN_STATUS_SUCCESS) {
        std::cerr << "fail to infer output size: " << cudnnGetErrorString(status) << std::endl;
        return -1;
    }
    int out_tensor_stride[5];
    compute_stride(out_tensor_size, out_tensor_stride);

    cudnnTensorDescriptor_t out_tensor_desc;
    status = cudnnCreateTensorDescriptor(&out_tensor_desc);
    if(status != CUDNN_STATUS_SUCCESS) {
        std::cerr << "fail to create out tensor desc: " << cudnnGetErrorString(status) << std::endl;
        return -1;
    }
    status = cudnnSetTensorNdDescriptor(out_tensor_desc, CUDNN_DATA_FLOAT, 5, out_tensor_size, out_tensor_stride);
    if(status != CUDNN_STATUS_SUCCESS) {
        std::cerr << "fail to set out tensor desc: " << cudnnGetErrorString(status) << std::endl;
        return -1;
    }

    // get conv alg
    int num_alg = 0;
    cudnnConvolutionFwdAlgoPerf_t alg_perf;
    status = cudnnGetConvolutionForwardAlgorithm_v7(handle, in_tensor_desc, filt_desc, conv_desc, out_tensor_desc,
                                                    1, &num_alg, &alg_perf);
    if(status != CUDNN_STATUS_SUCCESS) {
        std::cerr << "fail to get fw algorithm: " << cudnnGetErrorString(status) << std::endl;
        return -1;
    }

    if(num_alg < 1) {
        std::cerr << "no forward algorithm returned" << std::endl;
        return -1;
    }

    cudnnConvolutionFwdAlgo_t fwd_alg = alg_perf.algo;

    size_t workspace_size = 0;
    status = cudnnGetConvolutionForwardWorkspaceSize(handle, in_tensor_desc, filt_desc, conv_desc, out_tensor_desc,
                                                     fwd_alg, &workspace_size);
    if(status != CUDNN_STATUS_SUCCESS) {
        std::cerr << "fail to get forward workspace size: " << cudnnGetErrorString(status) << std::endl;
        return -1;
    }

    // allocate memory
    size_t in_bytes = 0, out_bytes = 0;
    status = cudnnGetTensorSizeInBytes(in_tensor_desc, &in_bytes);
    if(status != CUDNN_STATUS_SUCCESS) {
        std::cerr << "fail to get bytes of in tensor: " << cudnnGetErrorString(status) << std::endl;
        return -1;
    }
    status = cudnnGetTensorSizeInBytes(out_tensor_desc, &out_bytes);
    if(status != CUDNN_STATUS_SUCCESS) {
        std::cerr << "fail to get bytes of out tensor: " << cudnnGetErrorString(status) << std::endl;
        return -1;
    }

    size_t request = in_bytes + out_bytes + workspace_size + filt_bytes;
    size_t free = 0, total = 0;
    cudaError_t cuda_err = cudaMemGetInfo(&free, &total);
    if(cuda_err != cudaSuccess) {
        std::cerr << "fail to get mem info: " << cudaGetErrorString(cuda_err) << std::endl;
        return -1;
    }
    if(request > free) {
        std::cerr << "not enough gpu memory to run" << std::endl;
        return -1;
    }

    void* device_buffer = nullptr;
    cuda_err = cudaMalloc(&device_buffer, request);
    if(cuda_err != cudaSuccess) {
        std::cerr << "fail to allcoate memory: " << cudaGetErrorString(cuda_err) << std::endl;
        return -1;
    }

    // Carve the single allocation into sub-buffers: input | output | workspace | filter.
    void* in_tensor_ptr = device_buffer;
    void* out_tensor_ptr = (void*)((char*)(device_buffer) + in_bytes);
    void* workspace_ptr = (void*)((char*)(device_buffer) + in_bytes + out_bytes);
    void* filt_ptr = (void*)((char*)(device_buffer) + in_bytes + out_bytes + workspace_size);

    float alpha = 1.0f, beta = 0.0f;
    status = cudnnConvolutionForward(handle, &alpha,
                                     in_tensor_desc, in_tensor_ptr,
                                     filt_desc, filt_ptr,
                                     conv_desc, fwd_alg, workspace_ptr, workspace_size,
                                     &beta, out_tensor_desc, out_tensor_ptr);
    if(status != CUDNN_STATUS_SUCCESS) {
        std::cerr << "fail to forward: " << cudnnGetErrorString(status) << std::endl;
        return -1;
    }

    // clean up
    cuda_err = cudaFree(device_buffer);
    if(cuda_err != cudaSuccess) {
        std::cerr << "fail to free memory: " << cudaGetErrorString(cuda_err) << std::endl;
        return -1;
    }

    status = cudnnDestroyTensorDescriptor(out_tensor_desc);
    if(status != CUDNN_STATUS_SUCCESS) {
        std::cerr << "fail to destroy out tensor desc: " << cudnnGetErrorString(status) << std::endl;
        return -1;
    }

    status = cudnnDestroyFilterDescriptor(filt_desc);
    if(status != CUDNN_STATUS_SUCCESS) {
        std::cerr << "fail to destroy filt desc: " << cudnnGetErrorString(status) << std::endl;
        return -1;
    }

    status = cudnnDestroyTensorDescriptor(in_tensor_desc);
    if(status != CUDNN_STATUS_SUCCESS) {
        std::cerr << "fail to destroy tensor desc: " << cudnnGetErrorString(status) << std::endl;
        return -1;
    }

    status = cudnnDestroyConvolutionDescriptor(conv_desc);
    if(status != CUDNN_STATUS_SUCCESS) {
        std::cerr << "fail to destroy conv desc: " << cudnnGetErrorString(status) << std::endl;
        return -1;
    }

    status = cudnnDestroy(handle);
    if(status != CUDNN_STATUS_SUCCESS) {
        std::cerr << "fail to destroy cudnn handle" << std::endl;
        return -1;
    }

    return 0;
}
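
One more note on where the error is reported: the convolution kernel launch is asynchronous, so a device-side fault only surfaces at the next synchronizing call, which in this program happens to be cudaFree. Adding an explicit synchronization right after the cudnnConvolutionForward call (plain CUDA runtime calls, a small addition to the listing above) should make the failure show up at the convolution step instead:

    // Optional diagnostic, inserted right after the cudnnConvolutionForward error check:
    // force completion of the asynchronous convolution kernel so that a device-side fault
    // is reported here instead of at the later cudaFree.
    cuda_err = cudaDeviceSynchronize();
    if(cuda_err != cudaSuccess) {
        std::cerr << "convolution kernel failed: " << cudaGetErrorString(cuda_err) << std::endl;
        return -1;
    }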

Additional information:

When I switch the forward algorithm from CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, the error disappears.

However, switching to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM costs me the speedup of the precomputed variant, so this is only a workaround, not a fix; the sketch below shows how I force the algorithm.
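
For reference, the workaround simply bypasses the choice returned by cudnnGetConvolutionForwardAlgorithm_v7 and hard-codes the algorithm. A rough sketch, meant to replace the algorithm-selection part of the listing above:

    // Workaround sketch: skip the heuristic and force the implicit GEMM algorithm,
    // then query the workspace size for that specific algorithm.
    cudnnConvolutionFwdAlgo_t fwd_alg = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;

    size_t workspace_size = 0;
    status = cudnnGetConvolutionForwardWorkspaceSize(handle, in_tensor_desc, filt_desc, conv_desc, out_tensor_desc,
                                                     fwd_alg, &workspace_size);
    if(status != CUDNN_STATUS_SUCCESS) {
        std::cerr << "fail to get forward workspace size: " << cudnnGetErrorString(status) << std::endl;
        return -1;
    }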