Memory increases at end of kernel

Dear users, I can’t found any topic like this.
My question is why memory increases when the kernel is ending? (I watched it with nvidia-smi). In my example I’m using 7KB of memory but when watch application is close of 90, 200 and up to 700MB (when finish the execution)

There is any difference between memory allocated x memory-usage of kernel (how to calculate it)?

show us your code :)

//~ nvcc -g -G -rdc=true -arch=sm_35 -o t123 t123.cu -lcublas -lcublas_device -lcudadevrt
//~ sudo optirun --no-xorg ./t123
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <cublas_v2.h>

#define _size 256
#define _GENES 7
#define _MAXGB 536870912
//~ #define _MAXGB 1073741824

const int MINDEGREE = 2;
const int MAXDEGREE = 3;
const int MINSIZE = 5;

#define cudaCheckErrors(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
   if (code != cudaSuccess) {
      fprintf(stderr,"GPUassert: Error(%d) %s %s in line %d\n", (int)code, cudaGetErrorString(code), file, line);
      if (abort)
		exit(code);
   }
}

struct typebar{
   int inh;
   int non;
   int act;
};
typedef struct typebar typebar;

enum IRT{
	IRT_EQ = 0,
	IRT_LE,//<
	IRT_LQ,//<=
	IRT_GR,//>
	IRT_GQ //>=
};

__host__ __device__ float* dec2bin(int num, int sizeColunmArray){
	float *arr, aux;
	int i=0;
	int r;
	
	arr = new float;
	if(arr == NULL){
		printf("Error to allocate arr in dec2bin().\n");
		assert(arr);
	}	
	memset(arr, 0, sizeColunmArray*sizeof(float));
	
	while(num != 0){
		r = num%2;
		arr[i++] = static_cast<float> (r);
		num /= 2;
	}

	int j;
	for(j=0;j<i/2; j++){
		aux = arr[j];
		arr[j] = arr[i-j-1];
		arr[i-j-1] = aux;
	}
	//realloc vector
	if(i<sizeColunmArray){
		float *ptr;
		ptr = new float;
		if(ptr == NULL){
			printf("Error to allocate ptr in dec2bin().\n");
			assert(ptr);
		}	
		memset(ptr, 0, sizeColunmArray*sizeof(float));
		for(j=0; j<i; j++){
			ptr = arr[j];
		}
		
		delete [] arr;
		arr = NULL;		
		
		return ptr;
	}
	return arr;
}

__host__ __device__ unsigned int nextPow2(unsigned int x){
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}

__device__ void retifica(float *gene_value, char *retify, char *invalid_all, int i, float value){
	
	if(retify[i] == '0' && *invalid_all == '0'){
		gene_value[i] = value;
		retify[i] = '1';
	}
	else if(gene_value[i] != value && retify[i] == '1'){
		*invalid_all = '1';
	}
}

__global__ void kernel_getArrayFill(float *gene_value, char *retify, float *ind, float *array, int line){
	int tid = threadIdx.x;
	int size = blockDim.x;
	
	if(retify[tid] == '1')
		array[line*size + tid] = gene_value[tid];
	else
		array[line*size + tid] = ind[tid]-1;
}

__device__ void getNumBlocksAndThreads(int n, int maxThreads, int &blocks, int &threads){
	threads = (n < maxThreads) ? nextPow2(n) : maxThreads;
	blocks = (n + threads - 1) / threads;
}

__device__ float* get_array(float **gene_value, char **retify, char invalid_all, int size_columns, int *sizeArray){
	int i, line, quant_indices=1;
	float *ind, first_ind;
	float *array = NULL;
	int size_total = 0;
	
	if(invalid_all == '0'){
		for(i=0; i<size_columns; i++){
			if((*retify)[i] == '0'){
				quant_indices *= 3;
			}
		}
		size_total = quant_indices * size_columns;
		array = new float; 
		assert(array);
		ind = new float;
		assert(ind);
		
		memset(ind, 0, size_columns*sizeof(float));
		((*retify)[0] == '0') ? first_ind = 3.f : first_ind = 1.f;
		line = 0;
		while(ind[0] < first_ind){			
			#if (__CUDA_ARCH__ >= 350 )
				kernel_getArrayFill<<<1, size_columns>>>(*gene_value, *retify, ind, array, line);
				cudaDeviceSynchronize();
			#else
				for(i=0; i<size_columns; i++){
					if((*retify)[i] == '1')
						array[line*size_columns+i] = (*gene_value)[i];
					else
						array[line*size_columns+i] = ind[i]-1;
				}
			#endif


			for(i = size_columns-1; i > -1; i--){
				ind[i]++;
				if((*retify)[i] == '0'){
					if(ind[i] < 3 || i==0){
						break;
					}
					else{
						ind[i] = 0;
					}
				}
				else{
					if(ind[i] < 1 || i == 0){
						break;
					}
					else{
						ind[i] = 0;
					}
				}
			}
			line++;
		}
		delete [] ind;
		ind = NULL;
	}
	delete [] *gene_value;
	*gene_value = NULL;
	delete [] *retify;
	*retify = NULL;

	*sizeArray = size_total;
	
	return array;
}

__global__ void kernel_executa_linear_checking_sem_remover(float *result, char *invalids, int rel, int size){
	int tid = (blockIdx.x  + gridDim.x  * blockIdx.y) * (blockDim.x * blockDim.y) + (threadIdx.x + blockDim.x * threadIdx.y);
	
	if(tid < size){
		if( invalids[tid] == '0' ){
			if(rel == IRT_LE){
				if( result[tid] >= 0 ){
					invalids[tid]='1';
				}
			}
			else if( rel == IRT_LQ ){
				if( result[tid] > 0 ){
					invalids[tid]='1';			
				}
			}
			else if( rel == IRT_GR ){
				if( result[tid] <= 0 ){
					invalids[tid]='1';
				}
			}
			else if( rel == IRT_GQ ){
				if( result[tid] < 0 ){
					invalids[tid]='1';			
				}
			}
			else if( rel == IRT_EQ ){
				if( result[tid] != 0 ){
					invalids[tid]='1';
				}
			}
		}	
	}	
}

__device__ void linear_handle(cublasHandle_t cublashandle, cublasStatus_t cublasstat, float *matrix, int lines, int columns, char *invalids, IRT *_rel, int *num, int tamanho_rel, float *ptrAlpha, float *ptrBeta){

//~ __device__ void linear_handle(float *matrix, int lines, int columns, char *invalids, IRT *_rel, int *num, int tamanho_rel){
	float *vetorA = NULL, *result;
	
	result = new float[lines];			
	assert(result);

	int i;	
	for(i=0; i<tamanho_rel; i++){
		vetorA = dec2bin(num[i], columns);
		memset(result, 0, lines*sizeof(float));
		
		//chamada ao cublas com vetorA
		//cublasSgemv(cublashandle, CUBLAS_OP_T, lines, columns, ptrAlpha, matrix, columns, vetorA, 1, ptrBeta, result, 1);
		if(lines >= columns){	
			cublasstat = cublasSgemv(cublashandle, CUBLAS_OP_T, columns, lines, ptrAlpha, matrix, columns, vetorA, 1, ptrBeta, result, 1);
		}
		else{
			cublasstat = cublasSgemv(cublashandle, CUBLAS_OP_T, lines, columns, ptrAlpha, matrix, columns, vetorA, 1, ptrBeta, result, 1);
		}
		cudaDeviceSynchronize();
		
		if (cublasstat != CUBLAS_STATUS_SUCCESS) {
			printf("CUBLAS cublasSgemv() failed in executa_linear_sem_remover() %d\n", (int)cublasstat);
			return;
		}
		#if (__CUDA_ARCH__ >= 350 )
			int blocos=0, threads=0;
			getNumBlocksAndThreads(lines, 16, blocos, threads);
			kernel_executa_linear_checking_sem_remover<<<blocos, threads>>>(result, invalids, _rel[i], lines);
			cudaDeviceSynchronize();
		#endif
		delete [] vetorA;
		vetorA = NULL;
	}		

	delete [] result;
	result = NULL;
}

__device__ void linear_handle2d(cublasHandle_t cublashandle, cublasStatus_t cublasstat, float *matrix, int lines, int columns, char *invalids, float *vetorA, IRT _rel, float *ptrAlpha, float *ptrBeta){
	float *result;
	result = new float[lines];
	assert(result);
	memset(result, 0, lines*sizeof(float));
	
	//~ chamada ao cublas com vetorA
	//~ cublasSgemv(cublashandle, CUBLAS_OP_T, lines, columns, ptrAlpha, matrix, columns, vetorA, 1, ptrBeta, result, 1);

	if(lines >= columns){
		cublasstat = cublasSgemv(cublashandle, CUBLAS_OP_T, columns, lines, ptrAlpha, matrix, columns, vetorA, 1, ptrBeta, result, 1);
		cudaDeviceSynchronize();
		if (cublasstat != CUBLAS_STATUS_SUCCESS) {
			printf("CUBLAS cublasSgemv() failed in executa_linear_sem_remover2d first half lines >= columns\n");
			return;
		}
		cublasstat = cublasSgemv(cublashandle, CUBLAS_OP_T, columns, lines, ptrAlpha, matrix, columns, vetorA+columns, 1, ptrBeta, result, 1);
		cudaDeviceSynchronize();
		if (cublasstat != CUBLAS_STATUS_SUCCESS) {
			printf("CUBLAS cublasSgemv() failed in executa_linear_sem_remover2d second half lines >= columns\n");
			return;
		}
	}
	else{
		cublasstat = cublasSgemv(cublashandle, CUBLAS_OP_T, lines, columns, ptrAlpha, matrix, columns, vetorA, 1, ptrBeta, result, 1);
		cudaDeviceSynchronize();
		if (cublasstat != CUBLAS_STATUS_SUCCESS) {
			printf("CUBLAS cublasSgemv() failed in executa_linear_sem_remover2d first half lines < columns\n");
			return;
		}
		cublasstat = cublasSgemv(cublashandle, CUBLAS_OP_T, lines, columns, ptrAlpha, matrix, columns, vetorA+columns, 1, ptrBeta, result, 1);
		cudaDeviceSynchronize();
		if (cublasstat != CUBLAS_STATUS_SUCCESS) {
			printf("CUBLAS cublasSgemv() failed in executa_linear_sem_remover2d second half lines < columns\n");
			return;
		}
	}
	#if (__CUDA_ARCH__ >= 350 )		
		int blocos=0, threads=0;
		getNumBlocksAndThreads(lines, 16, blocos, threads);
		kernel_executa_linear_checking_sem_remover<<<blocos, threads>>>(result, invalids, _rel, lines);
		cudaDeviceSynchronize();
	#endif
	//~ printf("%c %c %c %c %c\n", invalids[215], invalids[216], invalids[217], invalids[218], invalids[219]);

	delete [] result;
	result = NULL;

	//~ cublasDestroy(cublashandle);
}

__global__ void kernel_count_invalids(char *invalids, int *lineInvalids, int size){
	int tid = (blockIdx.x  + gridDim.x  * blockIdx.y) * (blockDim.x * blockDim.y) + (threadIdx.x + blockDim.x * threadIdx.y);
	
	if(tid < size){
		if(invalids[tid] == '1'){
			atomicAdd(lineInvalids, 1);
		}		
	}
}

__host__ __device__ void remove_invalids(float **matrix, int *lines, int columns, char *invalids, int sizeInvalids){
	int newline = *lines - sizeInvalids;
	int newsizearray = newline*columns;
	float *newarray = new float[newsizearray];
	assert(newarray);
	
	int inicio = 0;
	int i;
	
	for(i=0; i<(*lines); i++){
		if(invalids[i] == '0'){
			memcpy(newarray+(inicio), (*matrix)+(i*columns), columns*sizeof(float));
			inicio = inicio + columns;
		}			
	}
	
	if(*matrix){		
		delete [] (*matrix);
		*matrix = NULL;
	}	
	
	*matrix = newarray;
	*lines = newline;	
}

__host__ __device__ void removendo_invalidos(float **matrix, int *lines, int columns, char **invalids){
	int *sizeInvalids = new int(0);
	
	#if (__CUDA_ARCH__ >= 350 )		
		int blocos=0, threads=0;
		getNumBlocksAndThreads(*lines, 16, blocos, threads);
		kernel_count_invalids<<<blocos, threads>>>(*invalids, sizeInvalids, *lines);
		cudaDeviceSynchronize();
	#endif
	remove_invalids(&(*matrix), lines, columns, *invalids, *sizeInvalids);
	//~ for(int i=0; i<(*lines); i++){
		//~ for(int j=0; j<_GENES; j++){
			//~ printf("%.f ", matrix[i*_GENES+j]);
		//~ }
		//~ printf("\n");
	//~ }
	
	delete sizeInvalids;
	sizeInvalids = NULL;
	delete [] *invalids;
	*invalids = NULL;
}

__device__ void teste_dd(cublasHandle_t cublashandle, cublasStatus_t cublasstat, float **meu_vetor, int *meu_lines){
	float *gene_value = new float[_GENES];
	assert(gene_value);
	char *retify = new char[_GENES], invalid_all;
	assert(retify);
	memset(gene_value, 0, _GENES*sizeof(float));
	memset(retify, '0', _GENES*sizeof(char));
	invalid_all = '0';
	retifica(gene_value, retify, &invalid_all, 0, -1);
	retifica(gene_value, retify, &invalid_all, 1, 1);
	if(threadIdx.x == 2){
		retifica(gene_value, retify, &invalid_all, 2, 1);
	}
	int meu_size = 0;
	*meu_vetor = get_array(&gene_value, &retify, invalid_all, _GENES, &meu_size);
	*meu_lines = meu_size/_GENES;
	
	char *invalids = new char[*meu_lines];
	assert(invalids);
	memset(invalids, '0', (*meu_lines)*sizeof(char));
	
	float alpha = 1.f, beta = 1.f;
	float *ptr_alpha = &alpha, *ptr_beta = β 
	
	float *num2 = new float[14];
	assert(num2);
	num2[0] = 0;
	num2[1] = 0;
	num2[2] = 0;
	num2[3] = 0;
	num2[4] = 0;
	num2[5] = 0;
	num2[6] = 0;
	num2[7] = 1;
	num2[8] = 1;
	num2[9] = 0;
	num2[10] = 0;
	num2[11] = 0;
	num2[12] = 1;
	num2[13] = 1;
	//~ IRT rel[1]={IRT_GR};
	//~ int num1[1]={9};	
	//~ linear_handle(cublashandle, cublasstat, *meu_vetor, *meu_lines, _GENES, invalids, rel, num1, 1, ptr_alpha, ptr_beta);
	linear_handle2d(cublashandle, cublasstat, *meu_vetor, *meu_lines, _GENES, invalids, num2, IRT_GQ, ptr_alpha, ptr_beta);
	removendo_invalidos(&(*meu_vetor), &(*meu_lines), _GENES, &invalids);
	
	delete [] num2;
	num2 = NULL;
}

__device__ bool CheckDegree(float *row, int size){
	int degree, i;

	degree = 0;
	for (i = 0; i < size; i++) {
	  if (row[i] != 0) {
		 degree++;
	  }
	}

	return (degree >= MINDEGREE && degree <= MAXDEGREE)? true : false;
}

__device__ void BarChart_per_block_atomic(float *row, typebar *A, int size){
	int j;

	for (j = 0; j < size; j++) {
		if (row[j] == -1) {
			atomicAdd(&A[j].inh, 1);
		} else if (row[j] == 0) {
			atomicAdd(&A[j].non, 1);
		} else{
			atomicAdd(&A[j].act, 1);
		}
	}
}

__global__ void kernel_conta_atividade(float *matrix, int lines, int columns, typebar *bar, int *rows_gi){
	int tid = (blockIdx.x  + gridDim.x  * blockIdx.y) * (blockDim.x * blockDim.y) + (threadIdx.x + blockDim.x * threadIdx.y);
	
	if(tid < lines){
		float *solucao = matrix+(tid*columns);
		if(columns <= MINSIZE || CheckDegree(solucao, columns)){
			BarChart_per_block_atomic(solucao, bar, columns);
			atomicAdd(rows_gi, 1);
		}
	}	
}

__global__ void kerneldd(){
	cublasHandle_t cublashandle;
	
	cublasStatus_t cublasstat = cublasCreate(&cublashandle);
	if (cublasstat != CUBLAS_STATUS_SUCCESS) {
		printf("CUBLAS initialization failed in %s()\n", __FUNCTION__);
		return;
	}
	cublasstat = cublasSetPointerMode(cublashandle, CUBLAS_POINTER_MODE_DEVICE);
	if (cublasstat != CUBLAS_STATUS_SUCCESS) {
		printf("CUBLAS initialization of cublasSetPointerMode failed in %s\n", __FUNCTION__);
		return;
	}	
	
	float *meu_vetor = NULL;
	int meu_lines = 0;
	teste_dd(cublashandle, cublasstat, &meu_vetor, &meu_lines);
	__shared__ typebar *bar;
	if(threadIdx.x == 0){
		bar = new typebar[_GENES*_GENES];
		assert(bar);
		memset(bar, 0, _GENES*_GENES*sizeof(typebar));
	}
	int blocos=0, threads=0;
	int *ptr_rows_gi = new int(0);
	getNumBlocksAndThreads(meu_lines, 16, blocos, threads);
	kernel_conta_atividade<<<blocos, threads>>>(meu_vetor, meu_lines, _GENES, bar+(threadIdx.x*_GENES), ptr_rows_gi);
	cudaDeviceSynchronize();
	printf("%d %d   %d %d %d   %d %d %d   %d %d %d   %d %d %d   %d %d %d   %d %d %d   %d %d %d\n", threadIdx.x, *ptr_rows_gi, bar[threadIdx.x*_GENES].inh, bar[threadIdx.x*_GENES].non, bar[threadIdx.x*_GENES].act, bar[threadIdx.x*_GENES+1].inh, bar[threadIdx.x*_GENES+1].non, bar[threadIdx.x*_GENES+1].act, bar[threadIdx.x*_GENES+2].inh, bar[threadIdx.x*_GENES+2].non, bar[threadIdx.x*_GENES+2].act, bar[threadIdx.x*_GENES+3].inh, bar[threadIdx.x*_GENES+3].non, bar[threadIdx.x*_GENES+3].act, bar[threadIdx.x*_GENES+4].inh, bar[threadIdx.x*_GENES+4].non, bar[threadIdx.x*_GENES+4].act, bar[threadIdx.x*_GENES+5].inh, bar[threadIdx.x*_GENES+5].non, bar[threadIdx.x*_GENES+5].act, bar[threadIdx.x*_GENES+6].inh, bar[threadIdx.x*_GENES+6].non, bar[threadIdx.x*_GENES+6].act);
	
	if(threadIdx.x == 0){		
		delete [] bar;
		bar = NULL;
	}
	delete [] meu_vetor;
	meu_vetor = NULL;
	delete ptr_rows_gi;
	ptr_rows_gi = NULL;
	cublasDestroy(cublashandle);
}

void teste(){
	cudaDeviceSetLimit(cudaLimitMallocHeapSize, _MAXGB);
	//allocate array of _GENES positions, retifica() sets pos X to value Y,
	//get_array() combines all possibilities of array (generating matrix),
	//linear_handle() and line_handle2d() manipulates with
	//cublas (cublasSgemv), remove_invalids() removes all lines that is invalids of matrix
	//kernel_conta_atividade() counts numbers of -1, .., 1 
	//prints number of -1,..,1 end program
	kerneldd<<<1, _GENES>>>();
	cudaCheckErrors(cudaDeviceSynchronize());
}

int main(int argc, char **argv){
	teste();
	
	return 0;
}

In teste() I talk what program do. I noticed that if takeoff cudaDeviceSetLimit the memory not increase, this implies on memory increase?

Yes, this will cause an increase in memory used by the application:

#define _MAXGB 536870912
//~ #define _MAXGB 1073741824

	cudaDeviceSetLimit(cudaLimitMallocHeapSize, _MAXGB);

That will add about an extra 540MB to the application memory footprint.

but this is not just a configuration heap for malloc/new (default is 8MB)? Why he allocates without using all 540MB?
How can use more than 8MB without providing size?

It’s a reservation. When your application runs, it reserves that amount, so that the expected in-kernel allocations to be made will be satisfied. This reservation effectively reduces the amount of free memory, which is actually what you are observing with nvidia-smi.

As a programmer, if you think carefully about this, I think you will agree that it is desirable to be able to reserve the expected amount of memory your in-kernel allocations will use, so that no allocations will fail, regardless of other GPU activity (other kernels, other allocations, other applications, other users).