How to cudaMalloc two-dimensional array ?

May be a dumb question … however, I still can’t make it work :-)

When allocationg something like this:
int* pArray;
cudaMalloc((void**)&pArray, 10 * sizeof(int));
everything works as expected.

However, what should be done to allocate and array of 10x10 ints ? The following code does not work (the very first malloc corrupts the memory).

int** ppArray;
cudaMalloc((void**)&ppArray, 10 * sizeof(int*));

for (int i = 0; i < 10; i++)
{
cudaMalloc((void**)&ppArray[i], 10 * sizeof(int));
}

Thanks in advance!

Looks like I’ve figured out the answer …
If all rows in the array has the same number of elements - use mallocpitch.

Otherwise, do not use 2d arrays at all, combine them to the single array.

Yes, it seems like most cublas library functions use a 1d array form for 2d arrays. Quite odd at first. I’m still trying to grasp it.

Look “ppArray” contained a GPU memory pointer before. And now, you are writing into &ppArray[i] from the host. That is invalid pointer indirection. YOur program must have segfaulted.

Instead have a proper host array and use it for the cudaMalloc() above and then cudaMemcpy that array of pointers to the pointer that you got from your first cudaMalloc().

It will work! I have done this so many times!!

Chilli

so can you provide the right code? cuz i am not really understanding … is it something like this?

int** ppArray_a, **ppArray_b;

cudaMalloc((void**)&ppArray_a, 10 * sizeof(int*));

cudaMemcpy(ppArray_b, ppArray_a, size, cudaMemcpyDeviceToHost);  

for (int i = 0; i < 10; i++)

{

cudaMalloc((void**)&ppArray_b[i], 10 * sizeof(int));

}

//and now ppArray_b should be 10x10, then do whatever

Thanks for the help

No…

cudaMalloc((void**)&ppArray_a, 10 * sizeof(int*));

for(int i=0; i<10; i++)

{

  cudaMalloc(&someHostArray[i], 100*sizeof(int)); /* Replace 100 with the dimension that u want */

}

cudaMemcpy(ppArray_a, someHostArray, 10*sizeof(int *), cudaMemcpyHostToDevice);

Thats it. HTH

What do you mean when you say, all rows must have “same” number of elements??

A 2D array will have same elements in each row?

Q1. How will you free memory from device 2D array ppArray ?

Would it be like :

for(i = 0; i < 10; i++)

  cudaFree(d_A[i])

cudaFree(d_A);

Q2. I followed your code for allocating and transfering data to device. However, when I retrieve it is 0.

I did following for retrieval

cudaMemcpy(someHostArray, ppArray_a, 10*sizeof(int *), cudaMemcpyDeviceToHost);

If am wrong for Q2 , can you guide me how to transfer data from device to host 2D array?? Thanks.

d_A is a GPU array. So, you cannot index it, like “d_A[i]”, inside CPU code.

YOu have to copy out the array into someHostArray (or you could use the same array that u used while creating the 2D array) and then free them one by one…

Like this:

cudaMemcpy(someHostArray, d_A, N*sizeof(void *), cudaMemcpyDeviceToHost);

for(int i=0; i<N; i++)

{

  cudaFree(someHostArray[i]);

}

cudaFree(d_A);

In the code above, you are copying the POINTERS and not data… I wonder why it is 0. It should be some "cudaMalloc"ed pointer that should be sitting there – which indicates to me that you have constructed the GPU 2D array properly.

Post your entire code. I can correct it and get it working - if u dont mind.

Best Regards,

Sarnath

Hi,

I have a stack of 4000 vectors each of length 360 units. The host data is a 2D matrix of dimension 4000 * 360. My aim is to compute the sum of all components

per vector such that i get 4000*1 result vector .Each element of result vector should be the resulting sum of per vector components.

Something like this:

|<- 3 * 4 matrix->| |<-3*1 result vector->|

v1 [1 2 3 4] [10]

v2 [1 1 1 1] [4]

v3 [9 5 6 7] [27]

I want to compute the above on GPU.

Q1. Is it true that I need to convert my host data to 1D format and then pass it to GPU to represent it as 2D format ?If it is possible to directly copy row by row of Host to row by row at gpu

can you illustrate?

Q2. As far as summation is concerned , I am relying on parallel reduction code from NVIDIA tutorial.

 My approach is following
float* host_A , float* host_result; 

	 host_A = malloc(H*W*sizeof(float));  //H is number of vectors or ht, W is number of elements/row

	 

		   

		 //init host_A 

		 ...

	 float *gpu_A;

		 //allocation at gpu

	 cudamalloc( (void**)&gpu_A , H*sizeof(float*) );

	 for(int i = 0; i < H; i++)

		  cudaMalloc( (void**)&gpu_A[i] , W*sizeof(float) );

	 //copy data

	 cudaMemcpy2D(gpu_A , W*sizeof(float) , host_A , W*sizeof(float) , W*sizeof(float) ,  H, cudaMemcpyHostToDevice);

		 

		  //result vector at gpu

		 cudamalloc( (void**)&gpu_R , H*sizeof(float) );

		 dim3 dimGrid(H,1,1); //4000 blocks

	 dim3 dimBlock(W,1,1); //360 threads per block

	 sum<<<dimGrid , dimBlock , W*sizeof(float)>>> (gpu_A , gpu_R , H,W);

	 

	 //result vector at host 

		 host_R = malloc(H*sizeof(float));

My sum kernel works as:

__global void sum(float d_A , float* d_R , int H, int W )

{

   int id = threadIdx.x + blockIdx.x * blockDim.x; 

int N = H*W;

   int BLK_SZ = blockDim.x;

if(id < N)

  {

	 for(int i = 0; i < H; i++)	  //Do reduction per sheet   

			reduce(d_A + i*W , d_R + i, BLK_SZ , W);   //This a function of type __device__

  }

}

NVIDIA’s reduction function that I am using:

__device__ void reduce(float *g_idata, float *g_odata, unsigned int blockSize, unsigned int n)

{

		extern __shared__ int sdata1[];

		unsigned int tid = threadIdx.x;

		unsigned int i = blockIdx.x*(blockSize*2) + tid;

		unsigned int gridSize = blockSize*2*gridDim.x;

		sdata1[tid] = 0;

		while (i < n) { sdata1[tid] += g_idata[i] + g_idata[i+blockSize]; i += gridSize; }

		__syncthreads();

		if (blockSize >= 512) { if (tid < 256) { sdata1[tid] += sdata1[tid + 256]; } __syncthreads(); }

		if (blockSize >= 256) { if (tid < 128) { sdata1[tid] += sdata1[tid + 128]; } __syncthreads(); }

		if (blockSize >= 128) { if (tid < 64) { sdata1[tid] += sdata1[tid + 64]; } __syncthreads(); }

		if (tid < 32) {

		if (blockSize >= 64) sdata1[tid] += sdata1[tid + 32];

		if (blockSize >= 32) sdata1[tid] += sdata1[tid + 16];

		if (blockSize >= 16) sdata1[tid] += sdata1[tid + 8];

		if (blockSize >= 8) sdata1[tid] += sdata1[tid + 4];

		if (blockSize >= 4) sdata1[tid] += sdata1[tid + 2];

		if (blockSize >= 2) sdata1[tid] += sdata1[tid + 1];

		}

		if (tid == 0) g_odata[blockIdx.x] = sdata1[0];

}

Observations:

  1. The above code computes the sum of the latest sheet correctly whereas the results of previous sheets are erased by junk values.

  2. To do tests, I take H = 1 and W = 30. ------- > I get correct result of 435

    Note that My host array is initialized as [0,1,2,3,4,…29]

                                                         [30,31,32......59] 
    
                                             {H = 2, W = 30 => N = 60 elements }
    

When H = 1 {the number of block is 1}, I get correct result.

When H = 2, I get corret result for the second sheet where as the first sheet computed sum is erased by junk number.

I guess this has something to do with shared variable being overwritten bu I do not know how and where to put __synthreads().

Help will be hugely appreciated.

It seems this would be a really bad way of implementing a 2d array on a gpu. The double pointer indirection on uncached memory will hurt your performance severely, unless you put it in shared memory. Why not just use the standard way of indexing as r+nr*c which just uses a few arithmetic operations?

@lutormx,

Yes, THats the right way to do it. I did post that point in many threads… But I think I forgot to hint that…Good that u brought it up!

You can always do the r +nrc OR rnc + c to get your data unless you have some application specific stuff (for example: excessive interchange of rows)

I had exactly not followed your idea of allocating 2D device pointer at CUDA. Here is what I did:

cudaMalloc( (void**)&d_Samples , N * sizeof(float*) ); 

for(i = 0; i < N; i++)

	 cudaMalloc( (void**)&d_Samples[i] , row_size);

Then I copied data from 1D cpu pointer to allocated 2D device pointer like this

cudaMemcpy2D(d_Samples , row_size , cpu_data , row_size , row_size ,  N , cudaMemcpyHostToDevice);

Then I am freeing memory as

cudaFree(d_Samples);

All the above sequences of operations are getting called inside a loop. Therefore, I need to allocate and then free memory at each iteration. (This is due to the architecture of my project that i can not give details of)

Anyway, coming to the point, At first iteration everything works well but seg-fault occurs at next iteration. Following your post one thing , how will I de-allocate memory from d_Samples ???

Help appreciated.Regards

cirus.

@Sarnath,

This time I went by your advice and found that the way you allocate and de-allocate 2D device pointer using hostarray variable works , but i tried to copy and then retrieve data from host to device array but failed. Here is what I did:

float *d_Ptr; 

	

	//copy data

	float *a = (float*)malloc(15*sizeof(float));

	for(int i = 0; i < 15; i++)

		a[i] = 0.1f*i;

	float ** temp = NULL, **c = NULL; 

	 temp = (float**)malloc(5 * sizeof(float*));

	 c = (float**)malloc(5 * sizeof(float*));

	 for(int i = 0; i < 5; i++)

	 {

		 temp[i] = (float*)malloc(3 * sizeof(float));

		 c[i] = (float*)malloc(3 * sizeof(float));

		 memset(c[i] , 0 , 3 * sizeof(float));

		 //memcpy(temp[i] , a + i*3 , 3*sizeof(float));

	 }

	float *b = (float*)malloc(15*sizeof(float));

		memset(b , 0 , 15*sizeof(float));

	cudaError_t err = cudaSuccess;

	

	err = cudaMalloc( (void**)&d_Ptr , sizeof(float*) * 5 );

	if(err != cudaSuccess)

	{

		printf("Failure in allocating Col array\n");

		exit(1);

	}

	else

		printf("Successful\n");

	for(int i = 0; i < 5; i++)

	{

		

		err = cudaSuccess;

		err = cudaMalloc( (void**)&temp[i] , 3 * sizeof(float) );

		//err = cudaMalloc( (void**)&d_Ptr[i] , 3 * sizeof(float) );

		if(err != cudaSuccess)

		{

			printf("Failure in allocating Row array at %d\n", i);

			exit(1);	

		}

		else

			printf("Successful allocation at %d\n",i);

	}

	err = cudaMemcpy(d_Ptr, temp, 15*sizeof(float*), cudaMemcpyHostToDevice);

//printf("copy data to allocated device var up\n\n");  

	//err = cudaSuccess; 

	//err = cudaMemcpy2D(d_Ptr , 3 * sizeof(float) , a , 3 * sizeof(float) , 3 * sizeof(float) , 5, cudaMemcpyHostToDevice);

	if(err != cudaSuccess)

	{

		printf("Failure in copying data\n");

		exit(1);	

	}

	else

		printf("Successful copying at \n");

	for(int i = 0; i < 5; i++)

		memset(temp[i], 0 , 3*sizeof(float) );

	

	printf("\nretrieve data from device var up\n\n");  

	err = cudaMemcpy2D(temp, 3 * sizeof(float), d_Ptr , 3 * sizeof(float) , 3 * sizeof(float) , 5, cudaMemcpyDeviceToHost);

	err = cudaMemcpy2D(c, 3 * sizeof(float), d_Ptr , 3 * sizeof(float) , 3 * sizeof(float) , 5, cudaMemcpyDeviceToHost);

	//err = cudaMemcpy2D(b, 3 * sizeof(float), d_Ptr , 3 * sizeof(float) , 3 * sizeof(float) , 5, cudaMemcpyDeviceToHost);

	if(err != cudaSuccess)

	{

		printf("Failure in getting data\n");

		exit(1);	

	}

	else

	{

		//for(int i = 0; i < 15; i++)

		//	printf("b[%d]=%f\n",i,b[i]);

		for(int i = 0; i < 5; i++){

			for(int j = 0; j < 3; j++){

				printf("temp[%d][%d]=%f\n",i,j , temp[i][j]);

			}

		}

		printf("\n\n");

		for(int i = 0; i < 5; i++){

			for(int j = 0; j < 3; j++){

				printf("c[%d][%d]=%f\n",i,j , c[i][j]);

			}

		}

		printf("Successful getting at \n");

	}

	

	

	printf("Successful End in allocating & getting data.. now freeing up\n\n");

	 //Free

	 err = cudaSuccess;

	 err = cudaMemcpy(temp, d_Ptr , 5 * sizeof(float*), cudaMemcpyDeviceToHost);

	 for(int i = 0; i < 5; i++)

	 {

		err = cudaSuccess;

		/*

		err = cudaFree((void*)&d_Ptr[i]);

		*/

		err = cudaFree(temp[i]);

		if(err != cudaSuccess)

		{

			printf("Failure in freeing Row at %d\n", i);

			exit(1);	

		}

		else

			printf("Successful free of row %d\n",i);

			

		 

	 }

	 err = cudaSuccess; 

	 err = cudaFree(d_Ptr);

	 	if(err != cudaSuccess)

		{

			printf("Failure in freeing\n");

			exit(1);	

		}

		else

			printf("Successful Freeing\n");

		 printf("ALL DONE!!\n");

Now if I comment stmt cudaMemcpy2D then code works but result is 0.This is the same zero that I talked in one of my earlier posts.

However, if while allocating 2D deviece array I do not use hostarray that is inside loop if i do

cudaMalloc((void**)&d_Temp , s3*sizeof(floaty) );

and then I copy data from host to device using command cudaMemcpy2D, then on retreival I get same data but then it gives seg fault while de-allocating 2D device pointer.

I have tried a lot but not successful in using your tech of allocation and freeing up plus getting exactly same data as put in. Kindly help.

This is the first bug that I encountered. See above. I did not read your code further to it. So, there could be other bugs sitting around…

Ok,

Here is the bug.

You are copying 15 pointers from temp to d_ptr. Both temp and d_ptr can hold only 5 pointers each… So, why are you copying around 15 of them…

As lutormx pointed out, you could also considering represnting the 2D array as a 1D array in the GPU. BUt no harm in trying this out…

Thanks for pointing out this mistake. I realized it after submitting the post :) , but still the problem remains. Still I am not able to get the copeid data.

Can you help me how can you copy data to 2D device pointer and retrieve it.I am struggling. Thank you for your valuable time and help.

Regards.

for(int i = 0; i < 5; i++)

		memset(temp[i], 0 , 3*sizeof(float) );

You are trying to memset GPU pointers to 0 using “memset”. It is not possible.

YOu need to use “cudaMemset” to do it… Isn’t it?

Wow that was a superfast reply :)

Hmm , here I am confused. I allocated h_Temp in CPU using malloc then I use it in allocating memory to each row of d_Ptr (device variable) . this is something udigestable.

Note: My h_Temp is a 2D cpu array.

As an alternative to allocate memory to a 2D device pointer float *d_Ptr I did following:

cudaMalloc(  (void**)&d_Ptr , 5 * sizeof(float*) );   //gives me the list of columns

for(int i = 0; i < 3; i++)

 cudaMalloc( (void**)&d_Ptr[i] , 3 * sizeof(float*)  );

This worked fine.After this I took a 1D CPU array of length 15 * sizeof(float) and copied its contents to d_Ptr using

cudaMemcpy2D(d_Ptr, 3* sizeof(float) , h_data, sizeof(float)* 3 ,  sizeof(float)* 3 , 5 , cudaMemcpyHostToDevice);

Then I am able to correcly retrieve data using cudaMemcpy2D but now with change in direction flag from D->H.

The problem with this approach is at freeing the 2D device pointer d_Temp.

that is when I try o free each row from d_Temp, then 0th row is successfully freed but subsequent rows give seg-fault.

I am not clear with the concept of using host array in allocating memory. I do not dispute your approach as I myself tested it and found it working , but then if I insert the statements of copying and retrieving data then things fail.

I think you are using 2 many things at the same time. FOr a momment, forget cudaMemcpy2D et al…

Can you just post the full source code so that I can compile it, fix it and give you back. Dont include “cutil”… I dont have SDK with me…

Here is the code.Note that I am just allocating and deallocating memory.This works fine. I want to copy contents of variable ‘a’ to d_Ptr and then retrieve it to check.

void Func()

{

float *d_Ptr;

	

	///...............produce data.....................

	float *a = (float*)malloc(15*sizeof(float));

	for(int i = 0; i < 15; i++)

		a[i] = 0.1f*i;

	

	//.................Host array.........................

	float ** temp = NULL;

	temp = (float**)malloc(5 * sizeof(float*));

	for(int i = 0; i < 5; i++)

	{

		 temp[i] = (float*)malloc(3 * sizeof(float));	

	}

	//...........Making...2D Device Array.......................

	cudaError_t err = cudaSuccess;

	err = cudaMalloc( (void**)&d_Ptr , sizeof(float*) * 5 );

	if(err != cudaSuccess)

	{

		printf("Failure in allocating Col array\n");

		exit(1);

	}

	else

		printf("Successful\n");

	for(int i = 0; i < 5; i++)

	{

		

		err = cudaSuccess;

		err = cudaMalloc( (void**)&temp[i] , 3 * sizeof(float) );

		//err = cudaMalloc( (void**)&d_Ptr[i] , 3 * sizeof(float) );

		if(err != cudaSuccess)

		{

			printf("Failure in allocating Row array at %d\n", i);

			exit(1);	

		}

		else

			printf("Successful allocation at %d\n",i);

	 }

	 

	 //..........................Free Memory from cuda var.....................................

	 err = cudaSuccess;

	 err = cudaMemcpy(temp, d_Ptr , 5 * sizeof(float*), cudaMemcpyDeviceToHost);

	 for(int i = 0; i < 5; i++)

	 {

		err = cudaSuccess;

		

		//err = cudaFree((void*)&d_Ptr[i]);

		

		err = cudaFree(temp[i]);

		if(err != cudaSuccess)

		{

			printf("Failure in freeing Row at %d\n", i);

			exit(1);	

		}

		else

			printf("Successful free of row %d\n",i);

			

		

	 }

	 err = cudaSuccess;

	 err = cudaFree(d_Ptr);

	 if(err != cudaSuccess)

	 {

			printf("Failure in freeing\n");

			exit(1);	

	 }

	 else

			printf("Successful Freeing\n");

	 printf("ALL DONE!!\n");

}//end of func

Many Many thanks in advance.