cudaMalloc not allocating memory

I am trying to allocate memory on the device, copy my data to it, perform the calculations on the GPU, copy the results back and then free the memory I allocated. I wanted to make sure that I wasn’t going over the limit, and I wanted to see whether I would have enough room in the shared memory space to store a few arrays.

I’m allocating the memory and no errors are being returned. However, when I use cudaMemGetInfo to check the amount of memory allocated, it looks like one cudaMalloc call hasn’t allocated any memory.

Also, when I try to free the memory, it looks like only one pointer is freed.

I am using the MATLAB mexFunction interface to set up the GPU memory and launch the kernel. At this point, I’m not even calling the kernel; I am just returning a unit matrix for the results.

...

    /* Status of the most recent CUDA runtime call. */
    cudaError_t cudaErr;

    /* Device memory figures in bytes, as filled in by cudaMemGetInfo.
     * freeMem holds the baseline reading taken here; allocMem is refreshed
     * after each later step so the difference from the baseline can be
     * printed. */
    size_t freeMem = 0;
    size_t totalMem = 0;
    size_t allocMem = 0;

    /* Take the baseline reading. If the query fails the counters keep their
     * zero initial values and the failure is reported instead of ignored. */
    cudaErr = cudaMemGetInfo(&freeMem, &totalMem);
    if (cudaErr != cudaSuccess){
        mexPrintf("could not query device memory\n");
        mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
    }

    /* size_t is not the same width as unsigned long on every platform
     * (e.g. 64-bit Windows), so cast explicitly to match %llu. */
    mexPrintf("Memory avaliable: Free: %llu, Total: %llu\n",
              (unsigned long long)freeMem, (unsigned long long)totalMem);

/* Pointers for the device memory. They start out NULL so that a failed
 * allocation leaves a well-defined value for the copy and free calls that
 * follow, instead of an uninitialised pointer. */
    double *devicePulseDelay = NULL, *deviceTarDistance = NULL, *deviceScattDistance = NULL, *deviceScatterers = NULL;
    double *deviceReceivedReal = NULL, *deviceReceivedImag = NULL;

/* Allocate memory on the device for the arrays.
 *
 * After every allocation the free/total figures are queried again and the
 * difference from the baseline (freeMem) is printed as "Consumed".
 *
 * NOTE(review): the figures reported by cudaMemGetInfo appear to move in
 * coarse steps (the recorded output shows multiples of 1048576 bytes), so a
 * small allocation may not change the reading at all - confirm against the
 * CUDA runtime documentation before treating an unchanged value as a failed
 * allocation. */
    mexPrintf("Allocating memory.\n");
    {
        /* One row per device buffer: where to store the pointer, how many
         * bytes to request, and the label used in the printed messages. */
        struct {
            double **devPtr;
            size_t bytes;
            const char *name;
        } allocs[] = {
            { &devicePulseDelay,    sizeof(double)*512,     "devicePulseDelay"    },
            { &deviceTarDistance,   sizeof(double)*512,     "deviceTarDistance"   },
            { &deviceScattDistance, sizeof(double)*999*512, "deviceScattDistance" },
            { &deviceScatterers,    sizeof(double)*999,     "deviceScatterers"    },
            { &deviceReceivedReal,  sizeof(double)*999*512, "deviceReceivedReal"  },
            { &deviceReceivedImag,  sizeof(double)*999*512, "deviceReceivedImag"  }
        };
        cudaError_t infoErr;
        size_t i;

        for (i = 0; i < sizeof(allocs)/sizeof(allocs[0]); ++i) {
            cudaErr = cudaMalloc( (void **) allocs[i].devPtr, allocs[i].bytes);
            if (cudaErr != cudaSuccess){
                mexPrintf("could not allocate memory to %s\n", allocs[i].name);
                mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
                /* Keep the pointer NULL so later calls fail cleanly. */
                *allocs[i].devPtr = NULL;
            }

            /* Separate status variable so cudaErr keeps the cudaMalloc result. */
            infoErr = cudaMemGetInfo(&allocMem, &totalMem);
            if (infoErr != cudaSuccess){
                mexPrintf("could not query device memory\n");
                mexPrintf("Error: %s\n",cudaGetErrorString(infoErr));
            }

            /* Casts match %llu; size_t is not unsigned long everywhere. */
            mexPrintf("%s: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
                      allocs[i].name,
                      (unsigned long long)allocMem,
                      (unsigned long long)totalMem,
                      (unsigned long long)(freeMem - allocMem));
        }
    }

/* copy the input arrays across to the device
 *
 * Each host array is copied into its device buffer with a blocking
 * cudaMemcpy; a failure is reported and the remaining copies are still
 * attempted. The memory figures are printed after every copy. */
    mexPrintf("\nCopying memory.\n");
    {
        /* One row per transfer: destination on the device, source on the
         * host, byte count, and the label used in the printed messages. */
        struct {
            void *dst;
            const void *src;
            size_t bytes;
            const char *name;
        } uploads[] = {
            { devicePulseDelay,    pulseDelay,    sizeof(double)*512,     "devicePulseDelay"    },
            { deviceTarDistance,   tarDistance,   sizeof(double)*512,     "deviceTarDistance"   },
            { deviceScattDistance, scattDistance, sizeof(double)*999*512, "deviceScattDistance" },
            { deviceScatterers,    scatterers,    sizeof(double)*999,     "deviceScatterers"    }
        };
        cudaError_t infoErr;
        size_t i;

        for (i = 0; i < sizeof(uploads)/sizeof(uploads[0]); ++i) {
            cudaErr = cudaMemcpy(uploads[i].dst, uploads[i].src, uploads[i].bytes, cudaMemcpyHostToDevice);
            if (cudaErr != cudaSuccess) {
                mexPrintf("could not copy to %s\n", uploads[i].name);
                mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
            }

            /* Separate status variable so cudaErr keeps the cudaMemcpy result. */
            infoErr = cudaMemGetInfo(&allocMem, &totalMem);
            if (infoErr != cudaSuccess) {
                mexPrintf("could not query device memory\n");
                mexPrintf("Error: %s\n",cudaGetErrorString(infoErr));
            }

            /* Casts match %llu; size_t is not unsigned long everywhere. */
            mexPrintf("%s: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
                      uploads[i].name,
                      (unsigned long long)allocMem,
                      (unsigned long long)totalMem,
                      (unsigned long long)(freeMem - allocMem));
        }
    }

/* call the kernel */
//     launchKernel<<<1,512>>>(........);

/* retrieve the output
 *
 * Copies the two result buffers from the device back to the host arrays and
 * prints the memory figures after each copy.
 *
 * NOTE(review): deviceReceivedReal and deviceReceivedImag are allocated with
 * sizeof(double)*999*512 bytes, but only sizeof(double)*512*512 bytes are
 * copied back here - confirm which size matches the host arrays. */
    {
        /* One row per transfer: destination on the host, source on the
         * device, byte count, and the label used in the printed messages. */
        struct {
            void *dst;
            const void *src;
            size_t bytes;
            const char *name;
        } downloads[] = {
            { receivedReal, deviceReceivedReal, sizeof(double)*512*512, "receivedReal" },
            { receivedImag, deviceReceivedImag, sizeof(double)*512*512, "receivedImag" }
        };
        cudaError_t infoErr;
        size_t i;

        for (i = 0; i < sizeof(downloads)/sizeof(downloads[0]); ++i) {
            cudaErr = cudaMemcpy(downloads[i].dst, downloads[i].src, downloads[i].bytes, cudaMemcpyDeviceToHost);
            if (cudaErr != cudaSuccess) {
                mexPrintf("could not copy to %s\n", downloads[i].name);
                mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
            }

            /* Separate status variable so cudaErr keeps the cudaMemcpy result. */
            infoErr = cudaMemGetInfo(&allocMem, &totalMem);
            if (infoErr != cudaSuccess) {
                mexPrintf("could not query device memory\n");
                mexPrintf("Error: %s\n",cudaGetErrorString(infoErr));
            }

            /* Casts match %llu; size_t is not unsigned long everywhere. */
            mexPrintf("%s: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
                      downloads[i].name,
                      (unsigned long long)allocMem,
                      (unsigned long long)totalMem,
                      (unsigned long long)(freeMem - allocMem));
        }
    }

/* free the scats memory.
 *
 * freeMem is re-read here so that it becomes the baseline for this phase;
 * after every cudaFree the figures are queried again and the gain over that
 * baseline is printed as "Free'd". */
    mexPrintf("\nFree'ing memory.\n");
    {
        /* One row per device buffer: the pointer variable to release and the
         * label used in the printed messages. */
        struct {
            double **devPtr;
            const char *name;
        } frees[] = {
            { &devicePulseDelay,    "devicePulseDelay"    },
            { &deviceTarDistance,   "deviceTarDistance"   },
            { &deviceScattDistance, "deviceScattDistance" },
            { &deviceScatterers,    "deviceScatterers"    },
            { &deviceReceivedReal,  "deviceReceivedReal"  },
            { &deviceReceivedImag,  "deviceReceivedImag"  }
        };
        cudaError_t infoErr;
        size_t i;

        infoErr = cudaMemGetInfo(&freeMem, &totalMem);
        if (infoErr != cudaSuccess) {
            mexPrintf("could not query device memory\n");
            mexPrintf("Error: %s\n",cudaGetErrorString(infoErr));
        }
        /* Casts match %llu; size_t is not unsigned long everywhere. */
        mexPrintf("Before freeing: Free %llu, Total: %llu\n",
                  (unsigned long long)freeMem, (unsigned long long)totalMem);

        for (i = 0; i < sizeof(frees)/sizeof(frees[0]); ++i) {
            cudaErr = cudaFree(*frees[i].devPtr);
            if (cudaErr != cudaSuccess) {
                /* The original message said "could free", dropping the "not". */
                mexPrintf("could not free %s\n", frees[i].name);
                mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
            } else {
                /* Do not leave a dangling device pointer behind. */
                *frees[i].devPtr = NULL;
            }

            /* Separate status variable so cudaErr keeps the cudaFree result. */
            infoErr = cudaMemGetInfo(&allocMem, &totalMem);
            if (infoErr != cudaSuccess) {
                mexPrintf("could not query device memory\n");
                mexPrintf("Error: %s\n",cudaGetErrorString(infoErr));
            }

            mexPrintf("%s: Memory avaliable: Free: %llu, Total: %llu, Free'd: %llu\n",
                      frees[i].name,
                      (unsigned long long)allocMem,
                      (unsigned long long)totalMem,
                      (unsigned long long)(allocMem - freeMem));
        }
    }

Here is the output from this:

Memory avaliable: Free: 2523959296, Total: 2818572288

Allocating memory.

devicePulseDelay: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576

deviceTarDistance: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576

deviceScattDistance: Memory avaliable: Free: 2518716416, Total: 2818572288, Consumed: 5242880

deviceScatterers: Memory avaliable: Free: 2517667840, Total: 2818572288, Consumed: 6291456

deviceReceivedReal: Memory avaliable: Free: 2515570688, Total: 2818572288, Consumed: 8388608

deviceReceivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760

Copying memory.

devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760

deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760

deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760

deviceScatterers: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760

receivedReal: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760

receivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760

Free’ing memory.

Before freeing: Free 2513473536, Total: 2818572288

devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Free’d: 0

deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free’d: 0

deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free’d: 0

deviceScatterers: Memory avaliable: Free: 2514522112, Total: 2818572288, Free’d: 1048576

deviceReceivedReal: Memory avaliable: Free: 2514522112, Total: 2818572288, Free’d: 1048576

deviceReceivedImag: Memory avaliable: Free: 2514522112, Total: 2818572288, Free’d: 1048576

I feel like there is something obvious that I’m missing. Can anyone help?