Global memory occupied until cudaDeviceReset() or app exits

Hi everyone,

We have found that any application that allocates some global memory, uses it (runs kernels), and then frees it still occupies a certain amount of device memory until it calls cudaDeviceReset() or exits. The amount does not seem to depend on how much memory was allocated and freed, nor on how many kernel launches were performed. Moreover, the amount can be significant: with our library it is usually more than 15% of the device memory. If we run several instances of the app, the occupied memory keeps growing until new allocations start to fail.

What is even stranger is that the application also occupies a small amount of memory on devices it has not used at all.

The problem is that we cannot use cudaDeviceReset(), because our library can be part of a customer's solution, and we cannot assume that they do not run anything else on the same device from other threads. It is also possible that our customers will want to run several instances of their app with our library.

Is there any way to get rid of this unwanted memory occupation without calling cudaDeviceReset()?
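
For completeness, this is roughly how we check it from inside a process (a minimal sketch using cudaMemGetInfo(); the device index and allocation size are arbitrary, and error checking is omitted):

#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    size_t freeBefore = 0, freeAfter = 0, total = 0;
    float *d_buf = NULL;

    // The first runtime call creates the context on the current device.
    cudaSetDevice(0);
    cudaMemGetInfo(&freeBefore, &total);

    // Allocate and immediately release 100 MiB of device memory.
    cudaMalloc((void **)&d_buf, (size_t)100 << 20);
    cudaFree(d_buf);

    cudaMemGetInfo(&freeAfter, &total);
    printf("free before: %zu MiB, free after: %zu MiB\n",
           freeBefore >> 20, freeAfter >> 20);

    // The two numbers are (almost) the same, i.e. the allocation itself is
    // returned, yet nvidia-smi keeps reporting additional memory for this
    // process until it exits or calls cudaDeviceReset().
    return 0;
}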

A simple example of this behavior is visible even with the NVIDIA CUDA Samples, here with the vectorAdd sample:

/**
 * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/**
 * Vector addition: C = A + B.
 *
 * This sample is a very basic sample that implements element by element
 * vector addition. It is the same as the sample illustrating Chapter 2
 * of the programming guide with some additions like error checking.
 */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

/**
 * CUDA Kernel Device code
 *
 * Computes the vector addition of A and B into C. The 3 vectors have the same
 * number of elements numElements.
 */
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        C[i] = A[i] + B[i];
    }
}


static void keypress_wait(void) {
    fputs("Press any key to exit.\n", stdout);
    fflush(NULL);
    fgetc(stdin);
}

/**
 * Host main routine
 */
int
main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;
    
    // Print the vector length to be used, and compute its size
    int numElements = 5000000;
    size_t size = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    // Allocate the host input vector A
    float *h_A = (float *)malloc(size);

    // Allocate the host input vector B
    float *h_B = (float *)malloc(size);

    // Allocate the host output vector C
    float *h_C = (float *)malloc(size);

    // Verify that allocations succeeded
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host input vectors
    for (int i = 0; i < numElements; ++i)
    {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }

    // Allocate the device input vector A
    float *d_A = NULL;
    err = cudaMalloc((void **)&d_A, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate the device input vector B
    float *d_B = NULL;
    err = cudaMalloc((void **)&d_B, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate the device output vector C
    float *d_C = NULL;
    err = cudaMalloc((void **)&d_C, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the host input vectors A and B in host memory to the device input vectors in
    // device memory
    printf("Copy input data from the host memory to the CUDA device\n");
    err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = 256;
    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the device result vector in device memory to the host result vector
    // in host memory.
    printf("Copy output data from the CUDA device to the host memory\n");
    err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Verify that the result vector is correct
    for (int i = 0; i < numElements; ++i)
    {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }
    printf("Test PASSED\n");

    // Free device global memory
    err = cudaFree(d_A);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    err = cudaFree(d_B);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    err = cudaFree(d_C);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    // Reset the device and exit
    //err = cudaDeviceReset();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("Done\n");

    atexit(keypress_wait);

    exit(0);
}

The only functional change here is that we commented out cudaDeviceReset() and added an atexit() handler so that the app does not exit immediately.
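
For comparison, if the reset is left in as in the original sample, the memory is released as soon as cudaDeviceReset() returns, while the process is still alive. A minimal sketch of that variant of the end of main():

    // Variant with the reset left in: nvidia-smi drops back to the baseline
    // as soon as cudaDeviceReset() returns, even though the process is still
    // alive at the pause below.
    err = cudaDeviceReset();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("Done\n");
    keypress_wait();   // pause here and check nvidia-smi before exiting
    exit(0);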

Now these are our nvidia-smi outputs:

  • Without any CUDA application running:
    +------------------------------------------------------+
    | NVIDIA-SMI 331.62     Driver Version: 331.62         |
    |-------------------------------+----------------------+----------------------+
    | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
    | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
    |===============================+======================+======================|
    |   0  GeForce GTX 750     On   | 0000:01:00.0     N/A |                  N/A |
    | 40%   38C  N/A     N/A /  N/A |      5MiB /  1023MiB |     N/A      Default |
    +-------------------------------+----------------------+----------------------+
    |   1  GeForce GTX 650 Ti  On   | 0000:02:00.0     N/A |                  N/A |
    | 37%   38C  N/A     N/A /  N/A |      5MiB /  1023MiB |     N/A      Default |
    +-------------------------------+----------------------+----------------------+
    |   2  GeForce GTX TITAN   On   | 0000:03:00.0     N/A |                  N/A |
    | 36%   55C  N/A     N/A /  N/A |     14MiB /  6143MiB |     N/A      Default |
    +-------------------------------+----------------------+----------------------+
    
    +-----------------------------------------------------------------------------+
    | Compute processes:                                               GPU Memory |
    |  GPU       PID  Process name                                     Usage      |
    |=============================================================================|
    |    0            Not Supported                                               |
    |    1            Not Supported                                               |
    |    2            Not Supported                                               |
    +-----------------------------------------------------------------------------+
    
  • One instance of the vectorAdd app:
    +------------------------------------------------------+
    | NVIDIA-SMI 331.62     Driver Version: 331.62         |
    |-------------------------------+----------------------+----------------------+
    | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
    | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
    |===============================+======================+======================|
    |   0  GeForce GTX 750     On   | 0000:01:00.0     N/A |                  N/A |
    | 40%   37C  N/A     N/A /  N/A |      6MiB /  1023MiB |     N/A      Default |
    +-------------------------------+----------------------+----------------------+
    |   1  GeForce GTX 650 Ti  On   | 0000:02:00.0     N/A |                  N/A |
    | 36%   37C  N/A     N/A /  N/A |      6MiB /  1023MiB |     N/A      Default |
    +-------------------------------+----------------------+----------------------+
    |   2  GeForce GTX TITAN   On   | 0000:03:00.0     N/A |                  N/A |
    | 37%   57C  N/A     N/A /  N/A |     82MiB /  6143MiB |     N/A      Default |
    +-------------------------------+----------------------+----------------------+
    
    +-----------------------------------------------------------------------------+
    | Compute processes:                                               GPU Memory |
    |  GPU       PID  Process name                                     Usage      |
    |=============================================================================|
    |    0            Not Supported                                               |
    |    1            Not Supported                                               |
    |    2            Not Supported                                               |
    +-----------------------------------------------------------------------------+
    
  • With 10 vectorAdd instances:
    +------------------------------------------------------+
    | NVIDIA-SMI 331.62     Driver Version: 331.62         |
    |-------------------------------+----------------------+----------------------+
    | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
    | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
    |===============================+======================+======================|
    |   0  GeForce GTX 750     On   | 0000:01:00.0     N/A |                  N/A |
    | 40%   39C  N/A     N/A /  N/A |      8MiB /  1023MiB |     N/A      Default |
    +-------------------------------+----------------------+----------------------+
    |   1  GeForce GTX 650 Ti  On   | 0000:02:00.0     N/A |                  N/A |
    | 39%   39C  N/A     N/A /  N/A |      8MiB /  1023MiB |     N/A      Default |
    +-------------------------------+----------------------+----------------------+
    |   2  GeForce GTX TITAN   On   | 0000:03:00.0     N/A |                  N/A |
    | 37%   59C  N/A     N/A /  N/A |    700MiB /  6143MiB |     N/A      Default |
    +-------------------------------+----------------------+----------------------+
    
    +-----------------------------------------------------------------------------+
    | Compute processes:                                               GPU Memory |
    |  GPU       PID  Process name                                     Usage      |
    |=============================================================================|
    |    0            Not Supported                                               |
    |    1            Not Supported                                               |
    |    2            Not Supported                                               |
    +-----------------------------------------------------------------------------+
    
  • With 84 instances of vectorAdd:
    +------------------------------------------------------+
    | NVIDIA-SMI 331.62     Driver Version: 331.62         |
    |-------------------------------+----------------------+----------------------+
    | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
    | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
    |===============================+======================+======================|
    |   0  GeForce GTX 750     On   | 0000:01:00.0     N/A |                  N/A |
    | 40%   40C  N/A     N/A /  N/A |     26MiB /  1023MiB |     N/A      Default |
    +-------------------------------+----------------------+----------------------+
    |   1  GeForce GTX 650 Ti  On   | 0000:02:00.0     N/A |                  N/A |
    | 40%   40C  N/A     N/A /  N/A |     26MiB /  1023MiB |     N/A      Default |
    +-------------------------------+----------------------+----------------------+
    |   2  GeForce GTX TITAN   On   | 0000:03:00.0     N/A |                  N/A |
    | 38%   60C  N/A     N/A /  N/A |   5707MiB /  6143MiB |     N/A      Default |
    +-------------------------------+----------------------+----------------------+
    
    +-----------------------------------------------------------------------------+
    | Compute processes:                                               GPU Memory |
    |  GPU       PID  Process name                                     Usage      |
    |=============================================================================|
    |    0            Not Supported                                               |
    |    1            Not Supported                                               |
    |    2            Not Supported                                               |
    +-----------------------------------------------------------------------------+
    

    Running another vectorAdd instance while 84 of them are already running fails with:

    Failed to allocate device vector A (error code out of memory)!