Thrust: Out of memory for large array

I am using MPI and CUDA Thrust.

I have the following MPI code:

#include "mpi.h"

	#include <stdio.h>

	#include <stdlib.h>

	#include <string.h>

	#include <time.h>

	#include <sys/time.h>

	#include <sys/resource.h>

	#define  MASTER		0

	#define ARRAYSIZE 40000000

	

	int *masterarray;

	int *onearray;

	int *twoarray;

	int *threearray;

	int *fourarray;

	int *fivearray;

	int *sixarray;

	int *sevenarray;

	int *eightarray;

	int *ninearray;      

	int main(int argc, char* argv[])

	{

	  int   numtasks, taskid,chunksize, namelen; 

	  int mysum,one,two,three,four,five,six,seven,eight,nine;

	char myname[MPI_MAX_PROCESSOR_NAME];

	MPI_Status status;

	int a,b,c,d,e,f,g,h,i,j;

	/***** Initializations *****/

	MPI_Init(&argc, &argv);

	MPI_Comm_size(MPI_COMM_WORLD, &numtasks);

	MPI_Comm_rank(MPI_COMM_WORLD,&taskid); 

	MPI_Get_processor_name(myname, &namelen);

	printf ("MPI task %d has started on host %s...\n", taskid, myname);

	//chunksize = 20000000;

	masterarray= malloc(ARRAYSIZE * sizeof(int));

	onearray= malloc(ARRAYSIZE * sizeof(int));

	twoarray= malloc(ARRAYSIZE * sizeof(int));

	threearray= malloc(ARRAYSIZE * sizeof(int));

	fourarray= malloc(ARRAYSIZE * sizeof(int));

	fivearray= malloc(ARRAYSIZE * sizeof(int));

	sixarray= malloc(ARRAYSIZE * sizeof(int));

	sevenarray= malloc(ARRAYSIZE * sizeof(int));

	eightarray= malloc(ARRAYSIZE * sizeof(int));

	ninearray= malloc(ARRAYSIZE * sizeof(int));

	

	/***** Master task only ******/

	if (taskid == MASTER){

           for(a=0;a<ARRAYSIZE;a++){

                 masterarray[a] = 1;

            }

	   mysum = run_kernel0(masterarray,ARRAYSIZE,taskid, myname);

	 }  /* end of master section */

	  if (taskid > MASTER) {

             if(taskid == 1){

                for(b=0;b<ARRAYSIZE;b++){

                onearray[b] = 1;

            }

                 one = run_kernel0(onearray,ARRAYSIZE,taskid, myname);

             }

             if(taskid == 2){

                for(c=0;c<ARRAYSIZE;c++){

                 twoarray[c] = 1;

            }

                 two = run_kernel0(twoarray,ARRAYSIZE,taskid, myname);

             }

             if(taskid == 3){

                 for(d=0;d<ARRAYSIZE;d++){

                 threearray[d] = 1;

                  }

                  three = run_kernel0(threearray,ARRAYSIZE,taskid, myname);

             }

	     if(taskid == 4){

                   for(e=0;e<ARRAYSIZE;e++){

                      fourarray[e] = 1;

                  }

                 four = run_kernel0(fourarray,ARRAYSIZE,taskid, myname);

             }

             if(taskid == 5){

                for(f=0;f<ARRAYSIZE;f++){

                  fivearray[f] = 1;

                  }

                five = run_kernel0(fivearray,ARRAYSIZE,taskid, myname);

             }

             if(taskid == 6){

for(g=0;g<ARRAYSIZE;g++){

                 sixarray[g] = 1;

                }

                 six = run_kernel0(sixarray,ARRAYSIZE,taskid, myname);

             }	

             if(taskid == 7){

                    for(h=0;h<ARRAYSIZE;h++){

                    sevenarray[h] = 1;

                  }

                   seven = run_kernel0(sevenarray,ARRAYSIZE,taskid, myname);

             }	

             if(taskid == 8){

for(i=0;i<ARRAYSIZE;i++){

                  eightarray[i] = 1;

                }

                   eight = run_kernel0(eightarray,ARRAYSIZE,taskid, myname);

             }	

             if(taskid == 9){

for(j=0;j<ARRAYSIZE;j++){

                 ninearray[j] = 1;

                   }

                   nine = run_kernel0(ninearray,ARRAYSIZE,taskid, myname);

             }	

	   }

	 MPI_Finalize();

	}

And my CUDA Thrust code:

#include <stdio.h>
#include <cutil_inline.h>
#include <cutil.h>
#include <thrust/version.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>

extern "C"
int run_kernel0(int array[], int nelements, int taskid, char hostname[])
{
    float elapsedTime;
    int d_sum;
    cudaEvent_t start, stop;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    thrust::device_vector<int> gpuarray(data, data + nelements);
    d_sum = thrust::reduce(gpuarray.begin(), gpuarray.end());

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    printf(" Task %d has sum (on GPU): %ld Time for the kernel: %f ms \n", taskid, d_sum, elapsedTime);

    return d_sum;

The code works when ARRAYSIZE is 20000000 but fails when I increase it.

Not sure what is causing the problem.

Here is the trace -

terminate called after throwing an instance of 'thrust::system::detail::bad_alloc'
what(): std::bad_alloc: out of memory
*** Process received signal ***
Signal: Aborted (6)
Signal code: (-6)
[ 0] [0xe1640c]
[ 1] /lib/libc.so.6(abort+0x17a) [0x5b43ca]
[ 2] /usr/lib/libstdc++.so.6(_ZN9__gnu_cxx27__verbose_terminate_handlerEv+0x167) [0x254327]
[ 3] /usr/lib/libstdc++.so.6(-0xff5b0e7a) [0x252186]
[ 4] /usr/lib/libstdc++.so.6(-0xff5b0e3d) [0x2521c3]
[ 5] /usr/lib/libstdc++.so.6(-0xff5b0cfe) [0x252302]
[ 6] mpi_array_distributed(_ZN6thrust6detail7backend4cuda6mallocILj0EEENS_10device_ptrIvEEj+0x17b) [0x805ff27]
[ 7] mpi_array_distributed(_ZN6thrust6detail7backend8dispatch6mallocILj0EEENS_10device_ptrIvEEjNS0_21cuda_device_space_tagE+0x19) [0x805fa12]
[ 8] mpi_array_distributed(_ZN6thrust13device_mallocEj+0x1d) [0x805f563]
[ 9] mpi_array_distributed(_ZN6thrust13device_mallocIiEENS_10device_ptrIT_EEj+0x23) [0x8061ef3]
[10] mpi_array_distributed(_ZN6thrust23device_malloc_allocatorIiE8allocateEjNS_10device_ptrIKiEE+0x5e) [0x8061886]
[11] mpi_array_distributed(_ZN6thrust6detail18contiguous_storageIiNS_23device_malloc_allocatorIiEEE8allocateEj+0x3f) [0x806136f]
[12] mpi_array_distributed(_ZN6thrust6detail11vector_baseIiNS_23device_malloc_allocatorIiEEE17allocate_and_copyIPiEEvjT_S7_RNS0_18contiguous_storageIiS3_EE+0x158) [0x8060fb8]
[13] mpi_array_distributed(_ZN6thrust6detail11vector_baseIiNS_23device_malloc_allocatorIiEEE10range_initIPiEEvT_S7_NS0_17integral_constantIbLb0EEE+0x42) [0x8060a0e]
[14] mpi_array_distributed(_ZN6thrust6detail11vector_baseIiNS_23device_malloc_allocatorIiEEE13init_dispatchIPiEEvT_S7_NS0_17integral_constantIbLb0EEE+0x23) [0x80602cd]
[15] mpi_array_distributed(ZN6thrust6detail11vector_baseIiNS_23device_malloc_allocatorIiEEEC2IPiEET_S7+0x3a) [0x806004a]
[16] mpi_array_distributed(ZN6thrust13device_vectorIiNS_23device_malloc_allocatorIiEEEC2IPiEET_S6+0x1f) [0x805fa63]
[17] mpi_array_distributed(run_kernel0+0x27) [0x805d407]
[18] mpi_array_distributed(main+0x54a) [0x805d35e]
[19] /lib/libc.so.6(__libc_start_main+0xe6) [0x59ece6]
[20] mpi_array_distributed() [0x805cd81]
*** End of error message ***

I am executing the program on a cluster.

Can anyone please help?

Thanks

Hey, that is out of memory. :)

If you do the math, you can easily see that your use of host memory forces the allocation of a similar amount of GPU memory, which hits the memory limit of the card.

You should probably build the data directly in Thrust device vectors on the GPU instead of staging it in host memory.
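For instance (a minimal sketch, assuming the data really is all 1s as in the posted loops, and that only the sum needs to come back to the host), the vector can be allocated and filled directly in device memory, so only the reduction result crosses the bus:

#include <thrust/device_vector.h>
#include <thrust/reduce.h>

// Sketch: allocate and fill the data on the GPU itself; no large host array
// and no host-to-device copy of 160 MB per process.
int sum_of_ones_on_gpu(int nelements)
{
    thrust::device_vector<int> gpuarray(nelements, 1);  // built in device memory
    return thrust::reduce(gpuarray.begin(), gpuarray.end());
}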

Hi,
The error message looks pretty clear to me: "out of memory".
Somehow you are trying to allocate more memory on the device than you have. Try compiling with "-G -g" to get more information about the exact line in your code where the allocation fails.

I understand that it runs out of memory, but I am not sure how to solve it.

Am I using the vectors incorrectly?

I am using a device_vector in the code. Is there anything wrong with that?

Well, at least the code you quoted in your initial message cannot be the actual one, since I doubt you'll find a compiler that accepts it.

Could you please post the actual code (or something close enough), along with the line where the allocation fails?

This is my Thrust code

extern "C"

	int run_kernel0( int array[], int nelements, int taskid, char hostname[])

	{

	        

	   float elapsedTime;int d_sum;

	   cudaEvent_t start, stop;

	   cudaEventCreate(&start);

	   cudaEventCreate(&stop);

	   cudaEventRecord(start, 0);

thrust::device_vector<int> gpuarray(array, array + nelements);

	   d_sum = thrust::reduce(gpuarray.begin(),gpuarray.end());

           cudaEventRecord(stop, 0);

	    cudaEventSynchronize(stop);     

	    cudaEventElapsedTime(&elapsedTime, start, stop);

	    cudaEventDestroy(start);

	    cudaEventDestroy(stop);

	      

	  printf(" Task %d has sum (on GPU): %ld Time for the kernel: %f ms \n", taskid, d_sum, elapsedTime); 

	  

	 return d_sum;

}

After compilation

nvcc -c kernel_distributed.cu -I /usr/local/NVIDIA_GPU_Computing_SDK/C/common/inc/ -G -g

/usr/local/cuda/bin/../include/thrust/detail/backend/cuda/reduce.inl(90): Warning: Cannot tell what pointer points to, assuming global memory space
/usr/local/cuda/bin/../include/thrust/detail/backend/cuda/reduce.inl(97): Warning: Cannot tell what pointer points to, assuming global memory space
/usr/local/cuda/bin/../include/thrust/detail/backend/cuda/reduce.inl(142): Warning: Cannot tell what pointer points to, assuming global memory space

After execution

/usr/local/bin/mpirun -x LD_LIBRARY_PATH --mca btl tcp,self --mca btl_tcp_if_include eth0 --hostfile slaves -np 10 mpi_array_distributed

MPI task 0 has started
MPI task 2 has started
MPI task 6 has started
MPI task 3 has started
MPI task 8 has started
MPI task 4 has started
MPI task 9 has started
MPI task 5 has started
MPI task 7 has started
MPI task 1 has started

terminate called after throwing an instance of 'thrust::system::detail::bad_alloc'
what(): std::bad_alloc: out of memory
*** Process received signal ***
Signal: Aborted (6)
Signal code: (-6)
[ 0] [0x38440c]
[ 1] /lib/libc.so.6(abort+0x17a) [0x13c3ca]
[ 2] /usr/lib/libstdc++.so.6(_ZN9__gnu_cxx27__verbose_terminate_handlerEv+0x167) [0x438327]
[ 3] /usr/lib/libstdc++.so.6(-0xff5b0e7a) [0x436186]
[ 4] /usr/lib/libstdc++.so.6(-0xff5b0e3d) [0x4361c3]
[ 5] /usr/lib/libstdc++.so.6(-0xff5b0cfe) [0x436302]
[ 6] mpi_array_distributed(_ZN6thrust6detail7backend4cuda6mallocILj0EEENS_10device_ptrIvEEj+0x17b) [0x8056801]
[ 7] mpi_array_distributed(_ZN6thrust6detail7backend8dispatch6mallocILj0EEENS_10device_ptrIvEEjNS0_21cuda_device_space_tagE+0x19) [0x805647a]
[ 8] mpi_array_distributed(_ZN6thrust13device_mallocEj+0x1d) [0x8055fcc]
[ 9] mpi_array_distributed(_ZN6thrust13device_mallocIiEENS_10device_ptrIT_EEj+0x23) [0x8057a7f]
[10] mpi_array_distributed(_ZN6thrust23device_malloc_allocatorIiE8allocateEjNS_10device_ptrIKiEE+0x5e) [0x8057690]
[11] mpi_array_distributed(_ZN6thrust6detail18contiguous_storageIiNS_23device_malloc_allocatorIiEEE8allocateEj+0x3f) [0x805710f]
[12] mpi_array_distributed(_ZN6thrust6detail11vector_baseIiNS_23device_malloc_allocatorIiEEE17allocate_and_copyIPiEEvjT_S7_RNS0_18contiguous_storageIiS3_EE+0x158) [0x8056f14]
[13] mpi_array_distributed(_ZN6thrust6detail11vector_baseIiNS_23device_malloc_allocatorIiEEE10range_initIPiEEvT_S7_NS0_17integral_constantIbLb0EEE+0x42) [0x8056c52]
[14] mpi_array_distributed(_ZN6thrust6detail11vector_baseIiNS_23device_malloc_allocatorIiEEE13init_dispatchIPiEEvT_S7_NS0_17integral_constantIbLb0EEE+0x23) [0x8056add]
[15] mpi_array_distributed(ZN6thrust6detail11vector_baseIiNS_23device_malloc_allocatorIiEEEC1IPiEET_S7+0x3a) [0x8056924]
[16] mpi_array_distributed(ZN6thrust13device_vectorIiNS_23device_malloc_allocatorIiEEEC2IPiEET_S6+0x1f) [0x80564cb]
[17] mpi_array_distributed(run_kernel0+0x50) [0x80544f0]
[18] mpi_array_distributed(main+0x4d6) [0x80543aa]
[19] /lib/libc.so.6(__libc_start_main+0xe6) [0x126ce6]
[20] mpi_array_distributed() [0x8053e41]
*** End of error message ***


mpirun noticed that process rank 8 with PID 6330 exited on signal 6 (Aborted).

Is that what you wanted to see?

From this it is unclear what the actual problem is. I think you should first eliminate the obvious possible problems in the environment.
Just for the sake of my understanding: what GPUs are you running on? Do they actually have enough memory? Are you sure your various MPI processes don't try to access the same GPU when there are several per compute node? (This last one would be my best bet.)
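If there are several GPUs per node, a hedged sketch of what I mean (the helper and the simple rank-based mapping are only an illustration) would be to pin each process to a device before its first CUDA or Thrust call:

#include <cuda_runtime.h>

// Illustration only: pick a device per MPI process before any other CUDA call.
// A real setup would use the *local* rank on the node rather than the global one.
void select_device_for_rank(int taskid)
{
    int ndevices = 0;
    cudaGetDeviceCount(&ndevices);
    if (ndevices > 0)
        cudaSetDevice(taskid % ndevices);
}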

I have tried code in which the master task distributes the data to the other MPI tasks and then each MPI task runs the kernel code.

That code works fine for larger arrays.

In this code I am just trying to eliminate the distribution step, assuming that each task already knows its own data.

I have one GPU, a GeForce 9500 GT, on each machine of the cluster. The cluster has 3 nodes including the master. The master node executes 2 MPI tasks and the other two nodes execute 4 tasks each.

I have just changed my data distribution strategy, and this time I am using Thrust instead of raw CUDA.

Yes, the tasks on a node use the same GPU for their calculation, and it worked well when the master task distributed the data.

GeForce 9500 GT: this means you've got either 256 MB, 512 MB or 1 GB of memory on the card.
Here you try to allocate ARRAYSIZE * sizeof(int) bytes of memory on the card per process. With ARRAYSIZE going over 20000000, the code fails, right?
But 20000000 * 4 bytes = 80 MB, so with 4 MPI processes on your card you already reach 320 MB for the data alone. I don't know what the size of a CUDA context is, or whether Thrust adds some extra memory consumption compared to plain CUDA arrays, but depending on the memory capacity of your card you might already be close to the limit.
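One way to see how close you actually are (a sketch, assuming you can call the CUDA runtime from run_kernel0 just before building the device_vector) would be to print the free and total device memory from each rank:

#include <cuda_runtime.h>
#include <stdio.h>

// Sketch: report the device memory situation as seen by this process.
void report_device_memory(int taskid)
{
    size_t free_bytes = 0, total_bytes = 0;
    cudaMemGetInfo(&free_bytes, &total_bytes);
    printf("Task %d: %lu MB free of %lu MB total on the device\n",
           taskid,
           (unsigned long)(free_bytes  / (1024 * 1024)),
           (unsigned long)(total_bytes / (1024 * 1024)));
}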

Besides this, I'm not sure whether the way you initialise the device vector from just a host pointer is valid. I guess it is, but since that's not the way I use it myself, I'm not so sure. And the fact that you are on compute capability 1.0 hardware (so without UVA to tell the runtime unambiguously that the memory is host memory and not device memory) makes it even more suspicious to me. But that's beyond the limits of my competence.
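For what it's worth, a more explicit form that should be equivalent (a sketch; as far as I know, raw pointers are treated as host pointers by Thrust here) is to size the device vector first and then copy the host range into it:

#include <thrust/device_vector.h>
#include <thrust/copy.h>

// Sketch: explicit two-step version of device_vector<int> gpuarray(array, array + nelements).
void fill_device_vector(thrust::device_vector<int> &gpuarray,
                        const int *array, int nelements)
{
    gpuarray.resize(nelements);                                // one device allocation
    thrust::copy(array, array + nelements, gpuarray.begin());  // host-to-device copy
}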

Yes, the card only has 512 MB and the compute capability is 1.1.

But then how come the same code works properly when the master task distributes an array of size 400000000?

With 10 MPI tasks, each task gets 40000000 integers and each task calculates its result correctly.

I use the same cards and I get the correct sum. The only difference is that that version uses raw CUDA code instead of Thrust.

I also tried to initialize the device vector as

thrust::host_vector<int> host(nelements);
thrust::generate(host.begin(), host.end(), rand);
thrust::device_vector<int> device_array = host;

But I get the same error. :(

I am not sure how I can overcome this. It's just a different way of initializing the data, I guess.

The only explanation I can come up with for why plain CUDA works and Thrust doesn't is that your CUDA code is somehow monolithic (it uses the device continuously from the first memory allocation to its release), which prevents time-sharing of the device, whereas Thrust allows the processes to share the device more gently and concurrently.
But here again, that is beyond my understanding of how concurrency between processes is managed by CUDA.
The question is very interesting to me though, and I'd be glad if somebody could get to the bottom of this.
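One thing that might help get there (a sketch only, not a fix): wrapping the allocation in a try/catch, so that the failing rank reports which allocation died instead of the whole job aborting:

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <new>
#include <cstdio>

// Sketch: catch the Thrust allocation failure per rank instead of letting it abort.
// thrust::system::detail::bad_alloc derives from std::bad_alloc, so this catch applies.
int try_reduce(const int *array, int nelements, int taskid)
{
    try {
        thrust::device_vector<int> gpuarray(array, array + nelements);
        return thrust::reduce(gpuarray.begin(), gpuarray.end());
    } catch (std::bad_alloc &e) {
        std::fprintf(stderr, "Task %d: device allocation of %d ints failed: %s\n",
                     taskid, nelements, e.what());
        return -1;  // the caller decides how to handle the failure
    }
}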

Here is the plain CUDA code and its output; the MPI code is as in the original post.

#include <stdio.h>
#include <cutil_inline.h>
#include <cutil.h>
#include <thrust/version.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>

#define BLOCK_NUM  8
#define THREAD_NUM 256

__global__ static void sumOfSquares(int *num, int *result, int DATA_SIZE)
{
    extern __shared__ int shared[];
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    shared[tid] = 0;
    for (int i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM) {
        shared[tid] += num[i];
    }

    __syncthreads();
    int offset = THREAD_NUM / 2;
    while (offset > 0) {
        if (tid < offset) {
            shared[tid] += shared[tid + offset];
        }
        offset >>= 1;
        __syncthreads();
    }

    if (tid == 0) {
        result[bid] = shared[0];
    }
}

extern "C"
int run_kernel0(int array[], int nelements, int taskid, char hostname[])
{
    int *gpudata, i;
    int *result;
    clock_t *time;
    cudaEvent_t start, stop;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    cudaMalloc((void **) &gpudata, sizeof(int) * nelements);
    cudaMalloc((void **) &result, sizeof(int) * THREAD_NUM * BLOCK_NUM);
    cudaMemcpy(gpudata, array, sizeof(int) * nelements, cudaMemcpyHostToDevice);
    printf("\n MPI Task %d is executing Kernel function........", taskid);

    int sum[BLOCK_NUM];
    sumOfSquares<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>(gpudata, result, nelements);
    cudaMemcpy(&sum, result, sizeof(int) * BLOCK_NUM, cudaMemcpyDeviceToHost);

    // calculate sum of each block
    int final_sum = 0;
    for (i = 0; i < BLOCK_NUM; i++) {
        final_sum += sum[i];
    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(gpudata);
    cudaFree(result);

    printf(" Task %d has sum (on GPU): %ld Time for the kernel: %f ms \n", taskid, final_sum, elapsedTime);

    return final_sum;
}

Error trace -

MPI task 0 has started on host
MPI task 1 has started on host
MPI task 2 has started on host
MPI task 3 has started on host
MPI task 4 has started on host
MPI task 6 has started on host
MPI task 7 has started on host
MPI task 8 has started on host
MPI task 9 has started on host
MPI task 5 has started on host
MPI Task 1 is executing Kernel function… Task 1 has sum (on GPU): 40000000 Time for the kernel: 120.534050 ms
MPI Task 0 is executing Kernel function… Task 0 has sum (on GPU): 40000000 Time for the kernel: 137.301315 ms
MPI Task 4 is executing Kernel function… Task 4 has sum (on GPU): 348456223 Time for the kernel: 0.000000 ms
MPI Task 7 is executing Kernel function… Task 7 has sum (on GPU): 353682719 Time for the kernel: 0.000000 ms
MPI Task 3 is executing Kernel function… Task 3 has sum (on GPU): 40000000 Time for the kernel: 4172.341309 ms
MPI Task 2 is executing Kernel function… Task 2 has sum (on GPU): 40000000 Time for the kernel: 4204.969727 ms

*** Process received signal ***
Signal: Segmentation fault (11)
Signal code: Address not mapped (1)
Failing at address: (nil)
[ 0] [0xd1340c]
[ 1] /usr/lib/libcuda.so(+0x163e12) [0x1092e12]
[ 2] /usr/lib/libcuda.so(+0x115749) [0x1044749]
[ 3] /usr/lib/libcuda.so(cuEventRecord+0x5c) [0x103578c]
[ 4] /usr/local/cuda/lib/libcudart.so.4(+0x2480f) [0x7fd80f]
[ 5] /usr/local/cuda/lib/libcudart.so.4(cudaEventRecord+0x22f) [0x838b8f]
[ 6] mpi_array_distributed(run_kernel0+0x32) [0x804a2b2]
[ 7] mpi_array_distributed(main+0x3ee) [0x804a0a2]
[ 8] /lib/libc.so.6(__libc_start_main+0xe6) [0x2fece6]
[ 9] mpi_array_distributed() [0x8049c21]
*** End of error message ***


mpirun noticed that process rank 5 with PID 6559 on node exited on signal 11 (Segmentation fault).

Tasks 4 and 7 give a wrong sum and a wrong time, and the run finally dies with a segmentation fault.

I am not sure what is causing this when each task initializes its own data.
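One last, hedged thought for narrowing this down: the raw CUDA version never checks return codes, so a failed cudaMalloc or cudaMemcpy on a busy card could produce this kind of garbage sum and near-zero timing instead of a clean error. A sketch of the kind of check that would confirm or rule this out (the macro is only an illustration):

#include <cuda_runtime.h>
#include <stdio.h>

/* Sketch: report any CUDA runtime error instead of silently continuing. */
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",              \
                    __FILE__, __LINE__, cudaGetErrorString(err));     \
        }                                                             \
    } while (0)

/* Used inside run_kernel0, for example:
   CUDA_CHECK(cudaMalloc((void **)&gpudata, sizeof(int) * nelements));
   CUDA_CHECK(cudaMemcpy(gpudata, array, sizeof(int) * nelements,
                         cudaMemcpyHostToDevice));                    */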